From 1fe02a1171b868152574a4662078c0ae21fe10a9 Mon Sep 17 00:00:00 2001
From: Azim Afroozeh
Date: Tue, 11 Jun 2024 10:07:29 -0400
Subject: [PATCH] init v_0_1_4

---
 .clang-format | 36 +
 .clang-tidy | 101 +
 .github/workflows/CI.yaml | 210 +
 .gitignore | 124 +
 BENCHMARKING.md | 136 +
 CMakeLists.txt | 78 +
 LICENSE | 21 +
 PRIMITIVES.md | 144 +
 README.md | 97 +
 benchmarks/CMakeLists.txt | 30 +
 .../analyze_better_blocks/.clang-format | 0
 benchmarks/analyze_better_blocks/.clang-tidy | 0
 benchmarks/analyze_better_blocks/.gitignore | 0
 benchmarks/analyze_better_blocks/.projectile | 0
 benchmarks/analyze_better_blocks/.rtags | 0
 .../analyze_better_blocks/CMakeLists.txt | 134 +
 benchmarks/analyze_better_blocks/README.md | 0
 .../alp/manual/CMakeLists.txt | 0
 .../alp/manual/bench_ped.cpp | 184 +
 .../alp/manual/duplicate.cpp | 154 +
 .../alp/manual/include/alp.hpp | 1159 +
 .../alp/manual/include/dataset.hpp | 44 +
 .../alp/manual/include/datasets.hpp | 448 +
 .../alp/manual/include/datasets_complete.hpp | 434 +
 .../alp/manual/test_ped.cpp | 163 +
 .../alp_bench/alp_bench.hpp | 2260 +
 .../alp_pub/results/i4i/README.md | 0
 .../alp_pub/results/i4i/ped.csv | 0
 .../alp_pub/results/i4i/ped.metadata | 0
 .../alp_pub/results/president/ped.csv | 0
 .../benchmark/local.cmake | 0
 .../benchmark/placeholder.cpp | 10 +
 .../cengine/CMakeLists.txt | 0
 .../cengine/analysis/Analysis.cpp | 202 +
 .../cengine/analysis/Analysis.hpp | 133 +
 .../cengine/analysis/StringStats.cpp | 205 +
 .../cengine/datablock/BtrReader.cpp | 212 +
 .../cengine/datablock/BtrReader.hpp | 49 +
 .../cengine/datablock/CMachine.hpp | 32 +
 .../cengine/datablock/Datablock.cpp | 405 +
 .../cengine/datablock/Datablock.hpp | 73 +
 .../cengine/datablock/cache/ThreadCache.cpp | 65 +
 .../cengine/datablock/cache/ThreadCache.hpp | 52 +
 .../cengine/datablock/schemes/CScheme.cpp | 47 +
 .../cengine/datablock/schemes/CScheme.hpp | 147 +
 .../datablock/schemes/CSchemePicker.cpp | 22 +
 .../datablock/schemes/CSchemePicker.hpp | 291 +
 .../cengine/datablock/schemes/CSchemePool.cpp | 167 +
 .../cengine/datablock/schemes/CSchemePool.hpp | 22 +
 .../datablock/schemes/DoubleSchemeType.cpp | 36 +
 .../datablock/schemes/DoubleSchemeType.hpp | 21 +
 .../datablock/schemes/IntegerSchemeType.cpp | 42 +
 .../datablock/schemes/IntegerSchemeType.hpp | 23 +
 .../datablock/schemes/StringSchemeType.cpp | 28 +
 .../datablock/schemes/StringSchemeType.hpp | 16 +
 .../schemes/v1/double/Dictionary.hpp | 64 +
 .../datablock/schemes/v1/double/OneValue.cpp | 47 +
 .../datablock/schemes/v1/double/OneValue.hpp | 34 +
 .../schemes/v1/double/Uncompressed.cpp | 35 +
 .../schemes/v1/double/Uncompressed.hpp | 30 +
 .../schemes/v1/integer/Dictionary.hpp | 73 +
 .../datablock/schemes/v1/integer/OneValue.cpp | 50 +
 .../datablock/schemes/v1/integer/OneValue.hpp | 36 +
 .../schemes/v1/integer/Truncation.cpp | 56 +
 .../schemes/v1/integer/Truncation.hpp | 117 +
 .../schemes/v1/integer/Uncompressed.cpp | 39 +
 .../schemes/v1/integer/Uncompressed.hpp | 33 +
 .../schemes/v1/string/Dictionary.cpp | 71 +
 .../schemes/v1/string/Dictionary.hpp | 50 +
 .../datablock/schemes/v1/string/OneValue.cpp | 177 +
 .../datablock/schemes/v1/string/OneValue.hpp | 39 +
 .../schemes/v1/string/Uncompressed.cpp | 49 +
 .../schemes/v1/string/Uncompressed.hpp | 37 +
 .../schemes/v1/templated/FixedDictionary.hpp | 74 +
 .../schemes/v1/templated/VarDictionary.hpp | 91 +
 .../schemes/v2/bitmap/RoaringBitmap.cpp | 149 +
 .../schemes/v2/bitmap/RoaringBitmap.hpp | 42 +
 .../datablock/schemes/v2/double/Decimal.cpp | 460 +
 .../datablock/schemes/v2/double/Decimal.hpp | 58 +
 .../datablock/schemes/v2/double/DoubleBP.cpp | 60 +
 .../datablock/schemes/v2/double/DoubleBP.hpp | 37 +
 .../schemes/v2/double/DynamicDictionary.cpp | 44 +
 .../schemes/v2/double/DynamicDictionary.hpp | 31 +
 .../datablock/schemes/v2/double/Frequency.cpp | 49 +
 .../datablock/schemes/v2/double/Frequency.hpp | 31 +
 .../datablock/schemes/v2/double/GDHacky.xpp | 135 +
 .../datablock/schemes/v2/double/Hacky.cpp | 204 +
 .../datablock/schemes/v2/double/Hacky.hpp | 48 +
 .../schemes/v2/double/MaxExponent.cpp | 174 +
 .../schemes/v2/double/MaxExponent.hpp | 45 +
 .../datablock/schemes/v2/double/PBP.hpp | 6 +
 .../datablock/schemes/v2/double/RLE.cpp | 46 +
 .../datablock/schemes/v2/double/RLE.hpp | 31 +
 .../schemes/v2/integer/DynamicDictionary.cpp | 46 +
 .../schemes/v2/integer/DynamicDictionary.hpp | 39 +
 .../datablock/schemes/v2/integer/FOR.cpp | 76 +
 .../datablock/schemes/v2/integer/FOR.hpp | 39 +
 .../schemes/v2/integer/Frequency.cpp | 51 +
 .../schemes/v2/integer/Frequency.hpp | 41 +
 .../datablock/schemes/v2/integer/PBP.cpp | 209 +
 .../datablock/schemes/v2/integer/PBP.hpp | 91 +
 .../datablock/schemes/v2/integer/RLE.cpp | 54 +
 .../datablock/schemes/v2/integer/RLE.hpp | 42 +
 .../schemes/v2/string/DynamicDictionary.cpp | 572 +
 .../schemes/v2/string/DynamicDictionary.hpp | 50 +
 .../datablock/schemes/v2/string/Fsst.cpp | 107 +
 .../datablock/schemes/v2/string/Fsst.hpp | 33 +
 .../v2/templated/DynamicDictionary.hpp | 200 +
 .../datablock/schemes/v2/templated/FOR.hpp | 58 +
 .../schemes/v2/templated/Frequency.hpp | 144 +
 .../datablock/schemes/v2/templated/RLE.hpp | 211 +
 .../cengine/datablock/stats/NumberStats.hpp | 134 +
 .../cengine/datablock/stats/StringStats.cpp | 39 +
 .../cengine/datablock/stats/StringStats.hpp | 28 +
 .../cengine/extern/BZIP2.cpp | 18 +
 .../cengine/extern/BZIP2.hpp | 8 +
 .../cengine/extern/FastPFOR.cpp | 60 +
 .../cengine/extern/FastPFOR.hpp | 35 +
 .../cengine/extern/LZ4.cpp | 16 +
 .../cengine/extern/LZ4.hpp | 8 +
 .../cengine/extern/XZ.cpp | 9 +
 .../cengine/extern/XZ.hpp | 8 +
 .../cengine/parser/CSVParser.hpp | 375 +
 .../cengine/parser/Parser.cpp | 187 +
 .../cengine/parser/Parser.hpp | 25 +
 .../cengine/parser/Trim.hpp | 48 +
 .../cengine/storage/Chunk.cpp | 274 +
 .../cengine/storage/Chunk.hpp | 106 +
 .../cengine/storage/Column.cpp | 66 +
 .../cengine/storage/Column.hpp | 26 +
 .../cengine/storage/Relation.cpp | 224 +
 .../cengine/storage/Relation.hpp | 38 +
 .../cengine/storage/StringArrayViewer.hpp | 44 +
 .../storage/StringPointerArrayViewer.hpp | 27 +
 .../cengine/utils/Utils.cpp | 24 +
 .../cengine/utils/Utils.hpp | 126 +
 .../analyze_better_blocks/cmake/.gitkeep | 0
 .../cmake/clang-tidy.cmake | 0
 .../analyze_better_blocks/codestyle.xml | 58 +
 benchmarks/analyze_better_blocks/harbook.cpp | 329 +
 .../playground/.dir-locals.el | 0
 .../playground/double.cpp | 120 +
 .../playground/double_benchmarking.cpp | 75 +
 .../playground/fetch-cols.sh | 0
 .../playground/fetch-double-cols.sh | 0
 .../playground/for_tests.cpp | 56 +
 .../playground/fsst_0.cpp | 35 +
 .../playground/fsst_benchmark.cpp | 250 +
 .../playground/generate_s3_data.cpp | 189 +
 .../playground/local.cmake | 0
 .../playground/pbi-double-columns.txt | 0
 .../playground/pbi-string-columns.txt | 0
 .../playground/playground.cpp | 124 +
 .../playground/pseudodecimal_benchmark.cpp | 176 +
 .../analyze_better_blocks/playground/rle.cpp | 130 +
 .../playground/s3-columns.txt | 0
 .../playground/sampling_algorithms.cpp | 326 +
 .../analyze_better_blocks/playground/tbb.cpp | 36 +
 .../playground/test-s3-crt.cpp | 263 +
 .../playground/test-s3-custom-stream.cpp | 99 +
 .../playground/test-s3-transfer.cpp | 93 +
 .../playground/test-s3.cpp | 78 +
 .../shared-headers/Exceptions.hpp | 23 +
 .../shared-headers/PerfEvent.hpp | 247 +
 .../shared-headers/PerfExternal.hpp | 45 +
 .../shared-headers/Reinterpret.hpp | 7 +
 .../shared-headers/SIMD.hpp | 28 +
 .../shared-headers/Units.hpp | 125 +
 .../shared-headers/local.cmake | 0
 .../analyze_better_blocks/test/CMakeLists.txt | 0
 .../test/DatasetGenerator.cpp | 366 +
 .../test/test-cases/TestHelper.cpp | 31 +
 .../test/test-cases/TestHelper.hpp | 13 +
 .../test/test-cases/V1.cpp | 161 +
 .../test/test-cases/V2.cpp | 110 +
 .../analyze_better_blocks/test/tester.cpp | 25 +
 .../tools/CMakeLists.txt | 0
 .../analysis/r-scripts/dataset_distribution.r | 0
 .../dataset_distribution_compressed.r | 0
 .../analysis/r-scripts/estimation_deviation.r | 0
 .../tools/analysis/r-scripts/overnight.r | 0
 .../analysis/r-scripts/sample_parameters.r | 0
 .../analysis/r-scripts/schemes_plugging.csv | 0
 .../analysis/r-scripts/schemes_plugging.r | 0
 .../r-scripts/total_compression_factor.r | 0
 .../r-scripts/total_compression_factor.tsv | 0
 .../tools/analysis/r-scripts/tzt.r | 0
 .../analysis/r-scripts/varying_block_size.csv | 0
 .../analysis/r-scripts/varying_block_size.r | 0
 .../r-scripts/varying_sample_parameter.csv | 0
 .../tools/conversion/CMakeLists.txt | 0
 .../tools/conversion/PerfEvent.hpp | 268 +
 .../tools/conversion/btrmeta.cpp | 97 +
 .../tools/conversion/btrtocsv.cpp | 167 +
 .../tools/conversion/compare_csvs.py | 0
 .../tools/conversion/csvtobtr.cpp | 214 +
 .../conversion/decompression-speed-s3.cpp | 181 +
 .../tools/conversion/decompression-speed.cpp | 241 +
 .../tools/conversion/s3-management.hpp | 207 +
 .../tools/datasets/.gitignore | 0
 .../tools/datasets/CMakeLists.txt | 0
 .../tools/datasets/prepare_dataset.sh | 0
 .../tools/datasets/stats/CMakeLists.txt | 0
 .../stats/double-stats/CMakeLists.txt | 0
 .../stats/double-stats/DecimalApplication.cpp | 119 +
 .../stats/double-stats/DoubleStatsExec.cpp | 113 +
 .../tools/datasets/stats/double_stats.py | 0
 .../tools/datasets/stats/final/all_double.ods | 0
 .../datasets/stats/final/all_doubles.csv | 0
 .../datasets/stats/final/all_integer.csv | 0
 .../datasets/stats/final/all_integers.ods | 0
 .../tools/datasets/stats/final/all_string.ods | 0
 .../datasets/stats/final/all_string_stats.csv | 0
 .../datasets/stats/final/all_string_tmp.ods | 0
 .../stats/final/brute_force_results.csv | 0
 .../datasets/stats/final/bzip_doubles.csv | 0
 .../datasets/stats/final/bzip_integers.csv | 0
 .../datasets/stats/final/bzip_strings.csv | 0
 .../final/compression_ratio_analysis.ods | 0
 .../stats/final/dict_before_after_tzt.ods | 0
 .../dict_sharing_compression_ratio_2.csv | 0
 .../datasets/stats/final/double_columns_count | 0
 .../stats/final/integer_columns_count | 0
 .../stats/final/integers_generico.csv | 0
 .../stats/final/integers_generico.ods | 0
 .../tools/datasets/stats/final/lz4_double.csv | 0
 .../datasets/stats/final/lz4_integer.csv | 0
 .../datasets/stats/final/raw/db1_integers.csv | 0
 .../datasets/stats/final/raw/db2_integers.csv | 0
 .../datasets/stats/final/string_columns_count | 0
 .../datasets/stats/final/strings_size.csv | 0
 .../final/vardict_8_16_with_without_tzt.ods | 0
 .../tools/datasets/stats/final/xz.csv | 0
 .../tools/datasets/stats/final/xz.ods | 0
 .../stats/integer-stats/CMakeLists.txt | 0
 .../stats/integer-stats/IntegerStats.cpp | 196 +
 .../tools/datasets/stats/integer_stats.py | 0
 .../stats/string-stats/CMakeLists.txt | 0
 .../stats/string-stats/StringFSST.cpp | 106 +
 .../stats/string-stats/StringSharing.cpp | 92 +
 .../stats/string-stats/StringStats.cpp | 192 +
 .../tools/datasets/string_stats.sh | 0
 .../tools/engine-comparison/note.txt | 0
 .../tools/list-s3-btrfiles.sh | 0
 .../tools/misc/create_table_parser.cpp | 12 +
 .../tools/misc/local.cmake | 0
 .../tools/mmapvector/CMakeLists.txt | 0
 .../tools/mmapvector/MMapVector.cpp | 28 +
 .../tools/mmapvector/MMapvector.hpp | 118 +
 .../tools/prepare-ec2-instance.sh | 0
 benchmarks/analyze_better_blocks/tools/r.bash | 0
 .../analyze_better_blocks/tools/stats.py | 0
 .../vendor/aws-sdk.cmake | 0
 .../vendor/benchmark.cmake | 0
 .../vendor/croaring.cmake | 0
 .../vendor/fastpfor.cmake | 0
 .../analyze_better_blocks/vendor/fsst.cmake | 0
 .../vendor/gdouble.cmake | 0
 .../analyze_better_blocks/vendor/gflags.cmake | 0
 .../vendor/googletest.cmake | 0
 .../analyze_better_blocks/vendor/lz4.cmake | 0
 .../vendor/rapidjson.cmake | 0
 .../analyze_better_blocks/vendor/spdlog.cmake | 0
 .../analyze_better_blocks/vendor/tbb.cmake | 0
 .../analyze_better_blocks/vendor/turbo.cmake | 0
 .../vendor/yaml-cpp.cmake | 0
 .../bench_compression_ratio/CMakeLists.txt | 37 +
 benchmarks/bench_compression_ratio/alp.cpp | 284 +
 benchmarks/bench_compression_ratio/alp32.cpp | 212 +
 benchmarks/bench_compression_ratio/chimp.cpp | 153 +
 .../bench_compression_ratio/chimp128.cpp | 186 +
 .../bench_compression_ratio/gorillas.cpp | 133 +
 benchmarks/bench_compression_ratio/patas.cpp | 139 +
 benchmarks/bench_compression_ratio/zstd.cpp | 90 +
 benchmarks/bench_speed/CMakeLists.txt | 53 +
 .../bench_speed/bench_alp_cutter_decode.cpp | 168 +
 .../bench_speed/bench_alp_cutter_encode.cpp | 164 +
 benchmarks/bench_speed/bench_alp_encode.cpp | 132 +
 .../bench_alp_without_sampling.cpp | 177 +
 benchmarks/bench_speed/bench_chimp.cpp | 209 +
 benchmarks/bench_speed/bench_chimp128.cpp | 307 +
 benchmarks/bench_speed/bench_gorillas.cpp | 167 +
 benchmarks/bench_speed/bench_patas.cpp | 154 +
 benchmarks/bench_speed/bench_zstd.cpp | 108 +
 benchmarks/fls_bench/LICENSE | 21 +
 benchmarks/fls_bench/fls_bench.hpp | 2261 +
 benchmarks/fls_bench/google/benchmark/LICENSE | 202 +
 benchmarks/include/alp_result.hpp | 49 +
 benchmarks/include/chimp/bit_reader.hpp | 152 +
 benchmarks/include/chimp/bit_utils.hpp | 20 +
 benchmarks/include/chimp/byte_reader.hpp | 121 +
 benchmarks/include/chimp/byte_writer.hpp | 56 +
 benchmarks/include/chimp/chimp.hpp | 242 +
 benchmarks/include/chimp/chimp128.hpp | 294 +
 benchmarks/include/chimp/chimp_utils.hpp | 139 +
 benchmarks/include/chimp/flag_buffer.hpp | 125 +
 .../include/chimp/leading_zero_buffer.hpp | 155 +
 .../include/chimp/output_bit_stream.hpp | 178 +
 benchmarks/include/chimp/packed_data.hpp | 87 +
 benchmarks/include/chimp/ring_buffer.hpp | 54 +
 benchmarks/include/duckdb/assert.hpp | 16 +
 benchmarks/include/duckdb/common.hpp | 4 +
 benchmarks/include/duckdb/constants.hpp | 132 +
 benchmarks/include/duckdb/duckdb.h | 2304 ++
 benchmarks/include/duckdb/exception.hpp | 354 +
 .../include/duckdb/exception_format_value.hpp | 51 +
 benchmarks/include/duckdb/fast_mem.hpp | 686 +
 benchmarks/include/duckdb/helper.hpp | 124 +
 benchmarks/include/duckdb/likely.hpp | 10 +
 benchmarks/include/duckdb/limits.hpp | 100 +
 .../include/duckdb/single_thread_ptr.hpp | 164 +
 benchmarks/include/duckdb/string.hpp | 8 +
 benchmarks/include/duckdb/to_string.hpp | 5 +
 benchmarks/include/duckdb/types.hpp | 487 +
 benchmarks/include/duckdb/validity_mask.hpp | 291 +
 benchmarks/include/duckdb/vector.hpp | 7 +
 benchmarks/include/duckdb/vector_size.hpp | 19 +
 benchmarks/include/duckdb/winapi.hpp | 25 +
 benchmarks/include/gorillas/gorillas.hpp | 224 +
 .../include/gorillas/gorillas_utils.hpp | 70 +
 benchmarks/include/patas/patas.hpp | 124 +
 benchmarks/test/CMakeLists.txt | 20 +
 benchmarks/test/test_chimp.cpp | 114 +
 benchmarks/test/test_chimp128.cpp | 150 +
 benchmarks/test/test_gorillas.cpp | 98 +
 benchmarks/test/test_patas.cpp | 105 +
 data/datasets_transformer.ipynb | 900 +
 data/edge_case/edge_case.csv | 1025 +
 data/generated/generated_doubles_bw0.csv | 1024 +
 data/generated/generated_doubles_bw1.csv | 1024 +
 data/generated/generated_doubles_bw10.csv | 1024 +
 data/generated/generated_doubles_bw11.csv | 1024 +
 data/generated/generated_doubles_bw12.csv | 1024 +
 data/generated/generated_doubles_bw13.csv | 1024 +
 data/generated/generated_doubles_bw14.csv | 1024 +
 data/generated/generated_doubles_bw15.csv | 1024 +
 data/generated/generated_doubles_bw16.csv | 1024 +
 data/generated/generated_doubles_bw17.csv | 1024 +
 data/generated/generated_doubles_bw18.csv | 1024 +
 data/generated/generated_doubles_bw19.csv | 1024 +
 data/generated/generated_doubles_bw2.csv | 1024 +
 data/generated/generated_doubles_bw20.csv | 1024 +
 data/generated/generated_doubles_bw21.csv | 1024 +
 data/generated/generated_doubles_bw22.csv | 1024 +
 data/generated/generated_doubles_bw23.csv | 1024 +
 data/generated/generated_doubles_bw24.csv | 1024 +
 data/generated/generated_doubles_bw25.csv | 1024 +
 data/generated/generated_doubles_bw26.csv | 1024 +
 data/generated/generated_doubles_bw27.csv | 1024 +
 data/generated/generated_doubles_bw28.csv | 1024 +
 data/generated/generated_doubles_bw29.csv | 1024 +
 data/generated/generated_doubles_bw3.csv | 1024 +
 data/generated/generated_doubles_bw30.csv | 1024 +
 data/generated/generated_doubles_bw31.csv | 1024 +
 data/generated/generated_doubles_bw32.csv | 1024 +
 data/generated/generated_doubles_bw33.csv | 1024 +
 data/generated/generated_doubles_bw34.csv | 1024 +
 data/generated/generated_doubles_bw35.csv | 1024 +
 data/generated/generated_doubles_bw36.csv | 1024 +
 data/generated/generated_doubles_bw37.csv | 1024 +
 data/generated/generated_doubles_bw38.csv | 1024 +
 data/generated/generated_doubles_bw39.csv | 1024 +
 data/generated/generated_doubles_bw4.csv | 1024 +
 data/generated/generated_doubles_bw40.csv | 1024 +
 data/generated/generated_doubles_bw41.csv | 1024 +
 data/generated/generated_doubles_bw42.csv | 1024 +
 data/generated/generated_doubles_bw43.csv | 1024 +
 data/generated/generated_doubles_bw44.csv | 1024 +
 data/generated/generated_doubles_bw45.csv | 1024 +
 data/generated/generated_doubles_bw46.csv | 1024 +
 data/generated/generated_doubles_bw47.csv | 1024 +
 data/generated/generated_doubles_bw48.csv | 1024 +
 data/generated/generated_doubles_bw49.csv | 1024 +
 data/generated/generated_doubles_bw5.csv | 1024 +
 data/generated/generated_doubles_bw50.csv | 1024 +
 data/generated/generated_doubles_bw51.csv | 1024 +
 data/generated/generated_doubles_bw52.csv | 1024 +
 data/generated/generated_doubles_bw53.csv | 1024 +
 data/generated/generated_doubles_bw54.csv | 1024 +
 data/generated/generated_doubles_bw55.csv | 1024 +
 data/generated/generated_doubles_bw56.csv | 1024 +
 data/generated/generated_doubles_bw57.csv | 1024 +
 data/generated/generated_doubles_bw58.csv | 1024 +
 data/generated/generated_doubles_bw59.csv | 1024 +
 data/generated/generated_doubles_bw6.csv | 1024 +
 data/generated/generated_doubles_bw60.csv | 1024 +
 data/generated/generated_doubles_bw61.csv | 1024 +
 data/generated/generated_doubles_bw62.csv | 1024 +
 data/generated/generated_doubles_bw63.csv | 1024 +
 data/generated/generated_doubles_bw64.csv | 1024 +
 data/generated/generated_doubles_bw7.csv | 1024 +
 data/generated/generated_doubles_bw8.csv | 1024 +
 data/generated/generated_doubles_bw9.csv | 1024 +
 data/include/column.hpp | 38 +
 data/include/data.hpp | 10 +
 data/include/double_columns.hpp | 286 +
 data/include/edge_case.hpp | 13 +
 data/include/float_columns.hpp | 19 +
 data/include/generated_columns.hpp | 80 +
 data/samples/air_sensor_f.csv | 1024 +
 data/samples/arade4.csv | 1024 +
 data/samples/basel_temp_f.csv | 1024 +
 data/samples/basel_wind_f.csv | 1024 +
 data/samples/bird_migration_f.csv | 1024 +
 data/samples/bitcoin_f.csv | 1024 +
 data/samples/bitcoin_transactions_f.csv | 1024 +
 data/samples/city_temperature_f.csv | 1024 +
 data/samples/cms1.csv | 1024 +
 data/samples/cms25.csv | 1024 +
 data/samples/cms9.csv | 1024 +
 data/samples/food_prices.csv | 1024 +
 data/samples/gov10.csv | 1024 +
 data/samples/gov26.csv | 1024 +
 data/samples/gov30.csv | 1024 +
 data/samples/gov31.csv | 1024 +
 data/samples/gov40.csv | 1024 +
 data/samples/medicare1.csv | 1024 +
 data/samples/medicare9.csv | 1024 +
 data/samples/neon_air_pressure.csv | 1024 +
 data/samples/neon_bio_temp_c.csv | 1024 +
 data/samples/neon_dew_point_temp.csv | 1024 +
 data/samples/neon_pm10_dust.csv | 1024 +
 data/samples/neon_wind_dir.csv | 1024 +
 data/samples/nyc29.csv | 1024 +
 data/samples/poi_lat.csv | 1024 +
 data/samples/poi_lon.csv | 1024 +
 data/samples/ssd_hdd_benchmarks_f.csv | 1024 +
 data/samples/stocks_de.csv | 1024 +
 data/samples/stocks_uk.csv | 1024 +
 data/samples/stocks_usa_c.csv | 1024 +
 example/CMakeLists.txt | 14 +
 example/adaptive_compress.cpp | 48 +
 example/include/helper.hpp | 25 +
 example/rd_compress.cpp | 46 +
 example/rd_compress32.cpp | 46 +
 example/simple_compress.cpp | 46 +
 example/simple_compress32.cpp | 46 +
 generated/CMakeLists.txt | 50 +
 generated/alp_generated.cpp | 0
 generated/arm64v8/CMakeLists.txt | 8 +
 generated/arm64v8/arm64v8.cpp | 0
 .../arm64v8/neon_intrinsic_uf1/CMakeLists.txt | 50 +
 ...4v8_neon_intrinsic_1024_uf1_falp_bench.cpp | 130 +
 ...m64v8_neon_intrinsic_1024_uf1_falp_src.cpp | 29670 ++++++++++++++
 ...64v8_neon_intrinsic_1024_uf1_falp_test.cpp | 119 +
 .../arm64v8/neon_intrinsic_uf1/falp.cmake | 32 +
 .../arm64v8/sve_intrinsic_uf1/CMakeLists.txt | 50 +
 ...64v8_sve_intrinsic_1024_uf1_falp_bench.cpp | 128 +
 ...rm64v8_sve_intrinsic_1024_uf1_falp_src.cpp | 29738 ++++++++++++++
 ...m64v8_sve_intrinsic_1024_uf1_falp_test.cpp | 119 +
 .../arm64v8/sve_intrinsic_uf1/falp.cmake | 31 +
 generated/fallback/CMakeLists.txt | 11 +
 generated/fallback/fallback.cpp | 0
 .../fallback/scalar_aav_uf1/CMakeLists.txt | 50 +
 .../fallback_scalar_aav_1024_uf1_falp_src.cpp | 33954 +++++++++++++++
 ...fallback_scalar_aav_1024_uf1_falp_test.cpp | 112 +
 generated/fallback/scalar_aav_uf1/falp.cmake | 16 +
 .../fallback/scalar_nav_uf1/CMakeLists.txt | 50 +
 ...allback_scalar_nav_1024_uf1_falp_bench.cpp | 128 +
 .../fallback_scalar_nav_1024_uf1_falp_src.cpp | 33764 +++++++++++++++
 ...fallback_scalar_nav_1024_uf1_falp_test.cpp | 119 +
 generated/fallback/scalar_nav_uf1/falp.cmake | 31 +
 generated/generated_files.txt | 36 +
 include/alp.hpp | 18 +
 include/alp/common.hpp | 19 +
 include/alp/compressor.hpp | 212 +
 include/alp/config.hpp | 28 +
 include/alp/constants.hpp | 242 +
 include/alp/decode.hpp | 130 +
 include/alp/decompressor.hpp | 154 +
 include/alp/encode.hpp | 448 +
 include/alp/falp.hpp | 111 +
 include/alp/rd.hpp | 183 +
 include/alp/sampler.hpp | 52 +
 include/alp/state.hpp | 37 +
 include/alp/storer.hpp | 56 +
 include/alp/utils.hpp | 74 +
 include/fastlanes/ffor.hpp | 21 +
 include/fastlanes/macros.hpp | 17 +
 include/fastlanes/unffor.hpp | 20 +
 publication/alp_compression_ratio.csv | 29 +
 publication/alp_rd32_compression_ratio.csv | 5 +
 publication/alp_rd_compression_ratio.csv | 3 +
 publication/alp_results.png | Bin 0 -> 224138 bytes
 publication/chimp128_compression_ratio.csv | 31 +
 publication/chimp_compression_ratio.csv | 31 +
 publication/gorillas_compression_ratio.csv | 31 +
 publication/l.py | 11 +
 publication/patas_compression_ratio.csv | 31 +
 publication/results/c6g/README.md | 50 +
 .../arm64v8_neon_intrinsic_1024_uf1_falp.csv | 63 +
 ...64v8_neon_intrinsic_1024_uf1_falp.metadata | 27 +
 .../c6g/fallback_scalar_aav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_aav_1024_uf1_falp.metadata | 27 +
 .../c6g/fallback_scalar_nav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_nav_1024_uf1_falp.metadata | 27 +
 publication/results/c7g/README.md | 47 +
 .../arm64v8_neon_intrinsic_1024_uf1_falp.csv | 63 +
 ...64v8_neon_intrinsic_1024_uf1_falp.metadata | 27 +
 .../c7g/fallback_scalar_aav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_aav_1024_uf1_falp.metadata | 27 +
 .../c7g/fallback_scalar_nav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_nav_1024_uf1_falp.metadata | 27 +
 publication/results/i4i/README.md | 8 +
 publication/results/i4i/alp_decode_cutter.csv | 4 +
 .../results/i4i/alp_decode_cutter.metadata | 27 +
 publication/results/i4i/alp_encode.csv | 29 +
 publication/results/i4i/alp_encode.metadata | 27 +
 publication/results/i4i/alp_encode_cutter.csv | 4 +
 .../results/i4i/alp_encode_cutter.metadata | 27 +
 publication/results/i4i/alp_encode_pde.csv | 29 +
 .../results/i4i/alp_encode_pde.metadata | 27 +
 .../i4i/alp_encode_without_sampling.csv | 29 +
 .../i4i/alp_encode_without_sampling.metadata | 27 +
 publication/results/i4i/chimp.csv | 63 +
 publication/results/i4i/chimp.metadata | 27 +
 publication/results/i4i/chimp128.csv | 63 +
 publication/results/i4i/chimp128.metadata | 27 +
 .../i4i/fallback_scalar_aav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_aav_1024_uf1_falp.metadata | 27 +
 .../i4i/fallback_scalar_nav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_nav_1024_uf1_falp.metadata | 27 +
 publication/results/i4i/gorillas.csv | 63 +
 publication/results/i4i/gorillas.metadata | 27 +
 publication/results/i4i/patas.csv | 63 +
 publication/results/i4i/patas.metadata | 27 +
 publication/results/i4i/ped.csv | 63 +
 publication/results/i4i/ped.metadata | 27 +
 ...86_64_avx512bw_intrinsic_1024_uf1_falp.csv | 193 +
 ..._avx512bw_intrinsic_1024_uf1_falp.metadata | 27 +
 publication/results/i4i_4xlarge/README.md | 8 +
 .../results/i4i_4xlarge/alp_decode_cutter.csv | 4 +
 .../i4i_4xlarge/alp_decode_cutter.metadata | 27 +
 .../results/i4i_4xlarge/alp_encode.csv | 29 +
 .../results/i4i_4xlarge/alp_encode.metadata | 27 +
 .../results/i4i_4xlarge/alp_encode_pde.csv | 29 +
 .../i4i_4xlarge/alp_encode_pde.metadata | 27 +
 .../alp_encode_without_sampling.csv | 29 +
 .../alp_encode_without_sampling.metadata | 27 +
 ...86_64_avx512bw_intrinsic_1024_uf1_falp.csv | 63 +
 ..._avx512bw_intrinsic_1024_uf1_falp.metadata | 27 +
 publication/results/m1/README.md | 5 +
 publication/results/m1/alp_encode.csv | 32 +
 publication/results/m1/alp_encode.metadata | 26 +
 .../arm64v8_neon_intrinsic_1024_uf1_falp.csv | 57 +
 ...64v8_neon_intrinsic_1024_uf1_falp.metadata | 26 +
 .../m1/fallback_scalar_aav_1024_uf1_falp.csv | 57 +
 ...fallback_scalar_aav_1024_uf1_falp.metadata | 26 +
 .../m1/fallback_scalar_nav_1024_uf1_falp.csv | 57 +
 ...fallback_scalar_nav_1024_uf1_falp.metadata | 26 +
 publication/results/m6a_xlarge/README.md | 5 +
 .../fallback_scalar_aav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_aav_1024_uf1_falp.metadata | 27 +
 .../fallback_scalar_nav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_nav_1024_uf1_falp.metadata | 27 +
 .../x86_64_avx2_intrinsic_1024_uf1_falp.csv | 63 +
 ...6_64_avx2_intrinsic_1024_uf1_falp.metadata | 27 +
 .../fallback_scalar_aav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_aav_1024_uf1_falp.metadata | 28 +
 ...86_64_avx512bw_intrinsic_1024_uf1_falp.csv | 63 +
 ..._avx512bw_intrinsic_1024_uf1_falp.metadata | 27 +
 publication/tmp/i4i_4xlarge/README.md | 8 +
 ...86_64_avx512bw_intrinsic_1024_uf1_falp.csv | 63 +
 ..._avx512bw_intrinsic_1024_uf1_falp.metadata | 27 +
 .../fallback_scalar_aav_1024_uf1_falp.csv | 63 +
 ...fallback_scalar_aav_1024_uf1_falp.metadata | 28 +
 ...86_64_avx512bw_intrinsic_1024_uf1_falp.csv | 63 +
 ..._avx512bw_intrinsic_1024_uf1_falp.metadata | 27 +
 publication/zstd_compression_ratio.csv | 31 +
 scripts/run-clang-format.py | 411 +
 src/CMakeLists.txt | 6 +
 src/falp.cpp | 33955 ++++++++++++++++
 src/fastlanes_ffor.cpp | 36 +
 src/fastlanes_generated_ffor.cpp | 30138 ++++++++++++++
 src/fastlanes_generated_unffor.cpp | 23212 +++++++++++
 src/fastlanes_unffor.cpp | 36 +
 test/CMakeLists.txt | 3 +
 test/include/test/mapper.hpp | 27 +
 test/test_alp_sample.cpp | 188 +
 toolchain/example.cmake | 5 +
 toolchain/m1.cmake | 6 +
 581 files changed, 356274 insertions(+)
 create mode 100644 .clang-format
 create mode 100644 .clang-tidy
 create mode 100644 .github/workflows/CI.yaml
 create mode 100644 .gitignore
 create mode 100644 BENCHMARKING.md
 create mode 100644 CMakeLists.txt
 create mode 100644 LICENSE
 create mode 100644 PRIMITIVES.md
 create mode 100644 README.md
 create mode 100644 benchmarks/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/.clang-format
 create mode 100644 benchmarks/analyze_better_blocks/.clang-tidy
 create mode 100644 benchmarks/analyze_better_blocks/.gitignore
 create mode 100644 benchmarks/analyze_better_blocks/.projectile
 create mode 100644 benchmarks/analyze_better_blocks/.rtags
 create mode 100644 benchmarks/analyze_better_blocks/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/README.md
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/bench_ped.cpp
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/duplicate.cpp
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/include/alp.hpp
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/include/dataset.hpp
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/include/datasets.hpp
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/include/datasets_complete.hpp
 create mode 100644 benchmarks/analyze_better_blocks/alp/manual/test_ped.cpp
 create mode 100644 benchmarks/analyze_better_blocks/alp_bench/alp_bench.hpp
 create mode 100644 benchmarks/analyze_better_blocks/alp_pub/results/i4i/README.md
 create mode 100644 benchmarks/analyze_better_blocks/alp_pub/results/i4i/ped.csv
 create mode 100644 benchmarks/analyze_better_blocks/alp_pub/results/i4i/ped.metadata
 create mode 100644 benchmarks/analyze_better_blocks/alp_pub/results/president/ped.csv
 create mode 100644 benchmarks/analyze_better_blocks/benchmark/local.cmake
 create mode 100644 benchmarks/analyze_better_blocks/benchmark/placeholder.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/cengine/analysis/Analysis.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/analysis/Analysis.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/analysis/StringStats.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/CMachine.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/Datablock.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/Datablock.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Dictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Dictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/FixedDictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/VarDictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/GDHacky.xpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/PBP.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/DynamicDictionary.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/FOR.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/Frequency.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/RLE.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/stats/NumberStats.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/BZIP2.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/BZIP2.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/LZ4.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/LZ4.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/XZ.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/extern/XZ.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/parser/CSVParser.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/parser/Parser.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/parser/Parser.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/parser/Trim.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/Chunk.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/Chunk.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/Column.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/Column.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/Relation.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/Relation.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/StringArrayViewer.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/storage/StringPointerArrayViewer.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/utils/Utils.cpp
 create mode 100644 benchmarks/analyze_better_blocks/cengine/utils/Utils.hpp
 create mode 100644 benchmarks/analyze_better_blocks/cmake/.gitkeep
 create mode 100644 benchmarks/analyze_better_blocks/cmake/clang-tidy.cmake
 create mode 100644 benchmarks/analyze_better_blocks/codestyle.xml
 create mode 100644 benchmarks/analyze_better_blocks/harbook.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/.dir-locals.el
 create mode 100644 benchmarks/analyze_better_blocks/playground/double.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/double_benchmarking.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/fetch-cols.sh
 create mode 100644 benchmarks/analyze_better_blocks/playground/fetch-double-cols.sh
 create mode 100644 benchmarks/analyze_better_blocks/playground/for_tests.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/fsst_0.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/fsst_benchmark.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/generate_s3_data.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/local.cmake
 create mode 100644 benchmarks/analyze_better_blocks/playground/pbi-double-columns.txt
 create mode 100644 benchmarks/analyze_better_blocks/playground/pbi-string-columns.txt
 create mode 100644 benchmarks/analyze_better_blocks/playground/playground.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/pseudodecimal_benchmark.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/rle.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/s3-columns.txt
 create mode 100644 benchmarks/analyze_better_blocks/playground/sampling_algorithms.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/tbb.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/test-s3-crt.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/test-s3-custom-stream.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/test-s3-transfer.cpp
 create mode 100644 benchmarks/analyze_better_blocks/playground/test-s3.cpp
 create mode 100644 benchmarks/analyze_better_blocks/shared-headers/Exceptions.hpp
 create mode 100644 benchmarks/analyze_better_blocks/shared-headers/PerfEvent.hpp
 create mode 100644 benchmarks/analyze_better_blocks/shared-headers/PerfExternal.hpp
 create mode 100644 benchmarks/analyze_better_blocks/shared-headers/Reinterpret.hpp
 create mode 100644 benchmarks/analyze_better_blocks/shared-headers/SIMD.hpp
 create mode 100644 benchmarks/analyze_better_blocks/shared-headers/Units.hpp
 create mode 100644 benchmarks/analyze_better_blocks/shared-headers/local.cmake
 create mode 100644 benchmarks/analyze_better_blocks/test/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/test/DatasetGenerator.cpp
 create mode 100644 benchmarks/analyze_better_blocks/test/test-cases/TestHelper.cpp
 create mode 100644 benchmarks/analyze_better_blocks/test/test-cases/TestHelper.hpp
 create mode 100644 benchmarks/analyze_better_blocks/test/test-cases/V1.cpp
 create mode 100644 benchmarks/analyze_better_blocks/test/test-cases/V2.cpp
 create mode 100644 benchmarks/analyze_better_blocks/test/tester.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/dataset_distribution.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/dataset_distribution_compressed.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/estimation_deviation.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/overnight.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/sample_parameters.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/schemes_plugging.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/schemes_plugging.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/total_compression_factor.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/total_compression_factor.tsv
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/tzt.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_block_size.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_block_size.r
 create mode 100644 benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_sample_parameter.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/PerfEvent.hpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/btrmeta.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/btrtocsv.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/compare_csvs.py
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/csvtobtr.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/decompression-speed-s3.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/decompression-speed.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/conversion/s3-management.hpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/.gitignore
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/prepare_dataset.sh
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DecimalApplication.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DoubleStatsExec.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/double_stats.py
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_double.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_doubles.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_integer.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_integers.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string_stats.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string_tmp.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/brute_force_results.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_doubles.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_integers.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_strings.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/compression_ratio_analysis.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/dict_before_after_tzt.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/dict_sharing_compression_ratio_2.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/double_columns_count
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/integer_columns_count
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/integers_generico.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/integers_generico.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/lz4_double.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/lz4_integer.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/raw/db1_integers.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/raw/db2_integers.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/string_columns_count
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/strings_size.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/vardict_8_16_with_without_tzt.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/xz.csv
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/final/xz.ods
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/integer-stats/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/integer-stats/IntegerStats.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/integer_stats.py
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringFSST.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringSharing.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringStats.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/datasets/string_stats.sh
 create mode 100644 benchmarks/analyze_better_blocks/tools/engine-comparison/note.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/list-s3-btrfiles.sh
 create mode 100644 benchmarks/analyze_better_blocks/tools/misc/create_table_parser.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/misc/local.cmake
 create mode 100644 benchmarks/analyze_better_blocks/tools/mmapvector/CMakeLists.txt
 create mode 100644 benchmarks/analyze_better_blocks/tools/mmapvector/MMapVector.cpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/mmapvector/MMapvector.hpp
 create mode 100644 benchmarks/analyze_better_blocks/tools/prepare-ec2-instance.sh
 create mode 100644 benchmarks/analyze_better_blocks/tools/r.bash
 create mode 100644 benchmarks/analyze_better_blocks/tools/stats.py
 create mode 100644 benchmarks/analyze_better_blocks/vendor/aws-sdk.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/benchmark.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/croaring.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/fastpfor.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/fsst.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/gdouble.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/gflags.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/googletest.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/lz4.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/rapidjson.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/spdlog.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/tbb.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/turbo.cmake
 create mode 100644 benchmarks/analyze_better_blocks/vendor/yaml-cpp.cmake
 create mode 100644 benchmarks/bench_compression_ratio/CMakeLists.txt
 create mode 100644 benchmarks/bench_compression_ratio/alp.cpp
 create mode 100644 benchmarks/bench_compression_ratio/alp32.cpp
 create mode 100644 benchmarks/bench_compression_ratio/chimp.cpp
 create mode 100644 benchmarks/bench_compression_ratio/chimp128.cpp
 create mode 100644 benchmarks/bench_compression_ratio/gorillas.cpp
 create mode 100644 benchmarks/bench_compression_ratio/patas.cpp
 create mode 100644 benchmarks/bench_compression_ratio/zstd.cpp
 create mode 100644 benchmarks/bench_speed/CMakeLists.txt
 create mode 100644 benchmarks/bench_speed/bench_alp_cutter_decode.cpp
 create mode 100644 benchmarks/bench_speed/bench_alp_cutter_encode.cpp
 create mode 100644 benchmarks/bench_speed/bench_alp_encode.cpp
 create mode 100644 benchmarks/bench_speed/bench_alp_without_sampling.cpp
 create mode 100644 benchmarks/bench_speed/bench_chimp.cpp
 create mode 100644 benchmarks/bench_speed/bench_chimp128.cpp
 create mode 100644 benchmarks/bench_speed/bench_gorillas.cpp
 create mode 100644 benchmarks/bench_speed/bench_patas.cpp
 create mode 100644 benchmarks/bench_speed/bench_zstd.cpp
 create mode 100644 benchmarks/fls_bench/LICENSE
 create mode 100644 benchmarks/fls_bench/fls_bench.hpp
 create mode 100644 benchmarks/fls_bench/google/benchmark/LICENSE
 create mode 100644 benchmarks/include/alp_result.hpp
 create mode 100644 benchmarks/include/chimp/bit_reader.hpp
 create mode 100644 benchmarks/include/chimp/bit_utils.hpp
 create mode 100644 benchmarks/include/chimp/byte_reader.hpp
 create mode 100644 benchmarks/include/chimp/byte_writer.hpp
 create mode 100644 benchmarks/include/chimp/chimp.hpp
 create mode 100644 benchmarks/include/chimp/chimp128.hpp
 create mode 100644 benchmarks/include/chimp/chimp_utils.hpp
 create mode 100644 benchmarks/include/chimp/flag_buffer.hpp
 create mode 100644 benchmarks/include/chimp/leading_zero_buffer.hpp
 create mode 100644 benchmarks/include/chimp/output_bit_stream.hpp
 create mode 100644 benchmarks/include/chimp/packed_data.hpp
 create mode 100644 benchmarks/include/chimp/ring_buffer.hpp
 create mode 100644 benchmarks/include/duckdb/assert.hpp
 create mode 100644 benchmarks/include/duckdb/common.hpp
 create mode 100644 benchmarks/include/duckdb/constants.hpp
 create mode 100644 benchmarks/include/duckdb/duckdb.h
 create mode 100644 benchmarks/include/duckdb/exception.hpp
 create mode 100644 benchmarks/include/duckdb/exception_format_value.hpp
 create mode 100644 benchmarks/include/duckdb/fast_mem.hpp
 create mode 100644 benchmarks/include/duckdb/helper.hpp
 create mode 100644 benchmarks/include/duckdb/likely.hpp
 create mode 100644 benchmarks/include/duckdb/limits.hpp
 create mode 100644 benchmarks/include/duckdb/single_thread_ptr.hpp
 create mode 100644 benchmarks/include/duckdb/string.hpp
 create mode 100644 benchmarks/include/duckdb/to_string.hpp
 create mode 100644 benchmarks/include/duckdb/types.hpp
 create mode 100644 benchmarks/include/duckdb/validity_mask.hpp
 create mode 100644 benchmarks/include/duckdb/vector.hpp
 create mode 100644 benchmarks/include/duckdb/vector_size.hpp
 create mode 100644 benchmarks/include/duckdb/winapi.hpp
 create mode 100644 benchmarks/include/gorillas/gorillas.hpp
 create mode 100644 benchmarks/include/gorillas/gorillas_utils.hpp
 create mode 100644 benchmarks/include/patas/patas.hpp
 create mode 100644 benchmarks/test/CMakeLists.txt
 create mode 100644 benchmarks/test/test_chimp.cpp
 create mode 100644 benchmarks/test/test_chimp128.cpp
 create mode 100644 benchmarks/test/test_gorillas.cpp
 create mode 100644 benchmarks/test/test_patas.cpp
 create mode 100644 data/datasets_transformer.ipynb
 create mode 100644 data/edge_case/edge_case.csv
 create mode 100644 data/generated/generated_doubles_bw0.csv
 create mode 100644 data/generated/generated_doubles_bw1.csv
 create mode 100644 data/generated/generated_doubles_bw10.csv
 create mode 100644 data/generated/generated_doubles_bw11.csv
 create mode 100644 data/generated/generated_doubles_bw12.csv
 create mode 100644 data/generated/generated_doubles_bw13.csv
 create mode 100644 data/generated/generated_doubles_bw14.csv
 create mode 100644 data/generated/generated_doubles_bw15.csv
 create mode 100644 data/generated/generated_doubles_bw16.csv
 create mode 100644 data/generated/generated_doubles_bw17.csv
 create mode 100644 data/generated/generated_doubles_bw18.csv
 create mode 100644 data/generated/generated_doubles_bw19.csv
 create mode 100644 data/generated/generated_doubles_bw2.csv
 create mode 100644 data/generated/generated_doubles_bw20.csv
 create mode 100644 data/generated/generated_doubles_bw21.csv
 create mode 100644 data/generated/generated_doubles_bw22.csv
 create mode 100644 data/generated/generated_doubles_bw23.csv
 create mode 100644 data/generated/generated_doubles_bw24.csv
 create mode 100644 data/generated/generated_doubles_bw25.csv
 create mode 100644 data/generated/generated_doubles_bw26.csv
 create mode 100644 data/generated/generated_doubles_bw27.csv
 create mode 100644 data/generated/generated_doubles_bw28.csv
 create mode 100644 data/generated/generated_doubles_bw29.csv
 create mode 100644 data/generated/generated_doubles_bw3.csv
 create mode 100644 data/generated/generated_doubles_bw30.csv
 create mode 100644 data/generated/generated_doubles_bw31.csv
 create mode 100644 data/generated/generated_doubles_bw32.csv
 create mode 100644 data/generated/generated_doubles_bw33.csv
 create mode 100644 data/generated/generated_doubles_bw34.csv
 create mode 100644 data/generated/generated_doubles_bw35.csv
 create mode 100644 data/generated/generated_doubles_bw36.csv
 create mode 100644 data/generated/generated_doubles_bw37.csv
 create mode 100644 data/generated/generated_doubles_bw38.csv
 create mode 100644 data/generated/generated_doubles_bw39.csv
 create mode 100644 data/generated/generated_doubles_bw4.csv
 create mode 100644 data/generated/generated_doubles_bw40.csv
 create mode 100644 data/generated/generated_doubles_bw41.csv
 create mode 100644 data/generated/generated_doubles_bw42.csv
 create mode 100644 data/generated/generated_doubles_bw43.csv
 create mode 100644 data/generated/generated_doubles_bw44.csv
 create mode 100644 data/generated/generated_doubles_bw45.csv
 create mode 100644 data/generated/generated_doubles_bw46.csv
 create mode 100644 data/generated/generated_doubles_bw47.csv
 create mode 100644 data/generated/generated_doubles_bw48.csv
 create mode 100644 data/generated/generated_doubles_bw49.csv
 create mode 100644 data/generated/generated_doubles_bw5.csv
 create mode 100644 data/generated/generated_doubles_bw50.csv
 create mode 100644 data/generated/generated_doubles_bw51.csv
 create mode 100644 data/generated/generated_doubles_bw52.csv
 create mode 100644 data/generated/generated_doubles_bw53.csv
 create mode 100644 data/generated/generated_doubles_bw54.csv
 create mode 100644 data/generated/generated_doubles_bw55.csv
 create mode 100644 data/generated/generated_doubles_bw56.csv
 create mode 100644 data/generated/generated_doubles_bw57.csv
 create mode 100644 data/generated/generated_doubles_bw58.csv
 create mode 100644 data/generated/generated_doubles_bw59.csv
 create mode 100644 data/generated/generated_doubles_bw6.csv
 create mode 100644 data/generated/generated_doubles_bw60.csv
 create mode 100644 data/generated/generated_doubles_bw61.csv
 create mode 100644 data/generated/generated_doubles_bw62.csv
 create mode 100644 data/generated/generated_doubles_bw63.csv
 create mode 100644 data/generated/generated_doubles_bw64.csv
 create mode 100644 data/generated/generated_doubles_bw7.csv
 create mode 100644 data/generated/generated_doubles_bw8.csv
 create mode 100644 data/generated/generated_doubles_bw9.csv
 create mode 100644 data/include/column.hpp
 create mode 100644 data/include/data.hpp
 create mode 100644 data/include/double_columns.hpp
 create mode 100644 data/include/edge_case.hpp
 create mode 100644 data/include/float_columns.hpp
 create mode 100644 data/include/generated_columns.hpp
 create mode 100644 data/samples/air_sensor_f.csv
 create mode 100644 data/samples/arade4.csv
 create mode 100644 data/samples/basel_temp_f.csv
 create mode 100644 data/samples/basel_wind_f.csv
 create mode 100644 data/samples/bird_migration_f.csv
 create mode 100644 data/samples/bitcoin_f.csv
 create mode 100644 data/samples/bitcoin_transactions_f.csv
 create mode 100644 data/samples/city_temperature_f.csv
 create mode 100644 data/samples/cms1.csv
 create mode 100644 data/samples/cms25.csv
 create mode 100644 data/samples/cms9.csv
 create mode 100644 data/samples/food_prices.csv
 create mode 100644 data/samples/gov10.csv
 create mode 100644 data/samples/gov26.csv
 create mode 100644 data/samples/gov30.csv
 create mode 100644 data/samples/gov31.csv
 create mode 100644 data/samples/gov40.csv
 create mode 100644 data/samples/medicare1.csv
 create mode 100644 data/samples/medicare9.csv
 create mode 100644 data/samples/neon_air_pressure.csv
 create mode 100644 data/samples/neon_bio_temp_c.csv
 create mode 100644 data/samples/neon_dew_point_temp.csv
 create mode 100644 data/samples/neon_pm10_dust.csv
 create mode 100644 data/samples/neon_wind_dir.csv
 create mode 100644 data/samples/nyc29.csv
 create mode 100644 data/samples/poi_lat.csv
 create mode 100644 data/samples/poi_lon.csv
 create mode 100644 data/samples/ssd_hdd_benchmarks_f.csv
 create mode 100644 data/samples/stocks_de.csv
 create mode 100644 data/samples/stocks_uk.csv
 create mode 100644 data/samples/stocks_usa_c.csv
 create mode 100644 example/CMakeLists.txt
 create mode 100644 example/adaptive_compress.cpp
 create mode 100644 example/include/helper.hpp
 create mode 100644 example/rd_compress.cpp
 create mode 100644 example/rd_compress32.cpp
 create mode 100644 example/simple_compress.cpp
 create mode 100644 example/simple_compress32.cpp
 create mode 100644 generated/CMakeLists.txt
 create mode 100644 generated/alp_generated.cpp
 create mode 100644 generated/arm64v8/CMakeLists.txt
 create mode 100644 generated/arm64v8/arm64v8.cpp
 create mode 100644 generated/arm64v8/neon_intrinsic_uf1/CMakeLists.txt
 create mode 100644 generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp
 create mode 100644 generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_src.cpp
 create mode 100644 generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp
 create mode 100644 generated/arm64v8/neon_intrinsic_uf1/falp.cmake
 create mode 100644 generated/arm64v8/sve_intrinsic_uf1/CMakeLists.txt
 create mode 100644 generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_bench.cpp
 create mode 100644 generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_src.cpp
 create mode 100644 generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_test.cpp
 create mode 100644 generated/arm64v8/sve_intrinsic_uf1/falp.cmake
 create mode 100644 generated/fallback/CMakeLists.txt
 create mode 100644 generated/fallback/fallback.cpp
 create mode 100644 generated/fallback/scalar_aav_uf1/CMakeLists.txt
 create mode 100644 generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_src.cpp
 create mode 100644 generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_test.cpp
 create mode 100644 generated/fallback/scalar_aav_uf1/falp.cmake
 create mode 100644 generated/fallback/scalar_nav_uf1/CMakeLists.txt
 create mode 100644 generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp
 create mode 100644 generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_src.cpp
 create mode 100644 generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_test.cpp
 create mode 100644 generated/fallback/scalar_nav_uf1/falp.cmake
 create mode 100644 generated/generated_files.txt
 create mode 100644 include/alp.hpp
 create mode 100644 include/alp/common.hpp
 create mode 100644 include/alp/compressor.hpp
 create mode 100644 include/alp/config.hpp
 create mode 100644 include/alp/constants.hpp
 create mode 100644 include/alp/decode.hpp
 create mode 100644 include/alp/decompressor.hpp
 create mode 100644 include/alp/encode.hpp
 create mode 100644 include/alp/falp.hpp
 create mode 100644 include/alp/rd.hpp
 create mode 100644 include/alp/sampler.hpp
 create mode 100644 include/alp/state.hpp
 create mode 100644 include/alp/storer.hpp
 create mode 100644 include/alp/utils.hpp
 create mode 100644 include/fastlanes/ffor.hpp
 create mode 100644 include/fastlanes/macros.hpp
 create mode 100644 include/fastlanes/unffor.hpp
 create mode 100644 publication/alp_compression_ratio.csv
 create mode 100644 publication/alp_rd32_compression_ratio.csv
 create mode 100644 publication/alp_rd_compression_ratio.csv
 create mode 100644 publication/alp_results.png
 create mode 100644 publication/chimp128_compression_ratio.csv
 create mode 100644 publication/chimp_compression_ratio.csv
 create mode 100644 publication/gorillas_compression_ratio.csv
 create mode 100644 publication/l.py
 create mode 100644 publication/patas_compression_ratio.csv
 create mode 100644 publication/results/c6g/README.md
 create mode 100644 publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.csv
 create mode 100644 publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata
 create mode 100644 publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.csv
 create mode 100644 publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.metadata
 create mode 100644 publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.csv
 create mode 100644 publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.metadata
 create mode 100644 publication/results/c7g/README.md
 create mode 100644 publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.csv
 create mode 100644 publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata
 create mode 100644 publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.csv
 create mode 100644 publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.metadata
 create mode 100644 publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.csv
 create mode 100644 publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.metadata
 create mode 100644 publication/results/i4i/README.md
 create mode 100644 publication/results/i4i/alp_decode_cutter.csv
 create mode 100644 publication/results/i4i/alp_decode_cutter.metadata
 create mode 100644 publication/results/i4i/alp_encode.csv
 create mode 100644 publication/results/i4i/alp_encode.metadata
 create mode 100644 publication/results/i4i/alp_encode_cutter.csv
 create mode 100644 publication/results/i4i/alp_encode_cutter.metadata
 create mode 100644 publication/results/i4i/alp_encode_pde.csv
 create mode 100644 publication/results/i4i/alp_encode_pde.metadata
 create mode 100644 publication/results/i4i/alp_encode_without_sampling.csv
 create mode 100644 publication/results/i4i/alp_encode_without_sampling.metadata
 create mode 100644 publication/results/i4i/chimp.csv
 create mode 100644 publication/results/i4i/chimp.metadata
 create mode 100644 publication/results/i4i/chimp128.csv
 create mode 100644 publication/results/i4i/chimp128.metadata
 create mode 100644 publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.csv
 create mode 100644 publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.metadata
 create mode 100644 publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.csv
 create mode 100644 publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.metadata
 create mode 100644 publication/results/i4i/gorillas.csv
 create mode 100644 publication/results/i4i/gorillas.metadata
 create mode 100644 publication/results/i4i/patas.csv
 create mode 100644 publication/results/i4i/patas.metadata
 create mode 100644 publication/results/i4i/ped.csv
 create mode 100644 publication/results/i4i/ped.metadata
 create mode 100644 publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv
 create mode 100644 publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata
 create mode 100644 publication/results/i4i_4xlarge/README.md
 create mode 100644 publication/results/i4i_4xlarge/alp_decode_cutter.csv
 create mode 100644 publication/results/i4i_4xlarge/alp_decode_cutter.metadata
 create mode 100644 publication/results/i4i_4xlarge/alp_encode.csv
 create mode 100644 publication/results/i4i_4xlarge/alp_encode.metadata
 create mode 100644 publication/results/i4i_4xlarge/alp_encode_pde.csv
 create mode 100644 publication/results/i4i_4xlarge/alp_encode_pde.metadata
 create mode 100644 publication/results/i4i_4xlarge/alp_encode_without_sampling.csv
 create mode 100644 publication/results/i4i_4xlarge/alp_encode_without_sampling.metadata
 create mode 100644 publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv
 create mode 100644 publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata
 create mode 100644 publication/results/m1/README.md
 create mode 100644 publication/results/m1/alp_encode.csv
 create mode 100644 publication/results/m1/alp_encode.metadata
 create mode 100644 publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.csv
 create mode 100644 publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.metadata
 create mode 100644 publication/results/m1/fallback_scalar_aav_1024_uf1_falp.csv
 create mode 100644 publication/results/m1/fallback_scalar_aav_1024_uf1_falp.metadata
 create mode 100644 publication/results/m1/fallback_scalar_nav_1024_uf1_falp.csv
 create mode 100644 publication/results/m1/fallback_scalar_nav_1024_uf1_falp.metadata
 create mode 100644 publication/results/m6a_xlarge/README.md
 create mode 100644 publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.csv
 create mode 100644 publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.metadata
 create mode 100644 publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.csv
 create mode 100644 publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.metadata
 create mode 100644 publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.csv
 create mode 100644
publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.metadata create mode 100644 publication/results/president/fallback_scalar_aav_1024_uf1_falp.csv create mode 100644 publication/results/president/fallback_scalar_aav_1024_uf1_falp.metadata create mode 100644 publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv create mode 100644 publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata create mode 100644 publication/tmp/i4i_4xlarge/README.md create mode 100644 publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv create mode 100644 publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata create mode 100644 publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.csv create mode 100644 publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.metadata create mode 100644 publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv create mode 100644 publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata create mode 100644 publication/zstd_compression_ratio.csv create mode 100644 scripts/run-clang-format.py create mode 100644 src/CMakeLists.txt create mode 100644 src/falp.cpp create mode 100644 src/fastlanes_ffor.cpp create mode 100644 src/fastlanes_generated_ffor.cpp create mode 100644 src/fastlanes_generated_unffor.cpp create mode 100644 src/fastlanes_unffor.cpp create mode 100644 test/CMakeLists.txt create mode 100644 test/include/test/mapper.hpp create mode 100644 test/test_alp_sample.cpp create mode 100644 toolchain/example.cmake create mode 100644 toolchain/m1.cmake diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..faae7b6 --- /dev/null +++ b/.clang-format @@ -0,0 +1,36 @@ +BasedOnStyle: LLVM +TabWidth: 4 +IndentWidth: 4 +UseTab: ForIndentation +DerivePointerAlignment: false +PointerAlignment: Left +AlignConsecutiveMacros: AcrossEmptyLinesAndComments +AlignAfterOpenBracket: Align +AlignTrailingComments: true +AlignConsecutiveDeclarations: Consecutive +AlignConsecutiveAssignments: Consecutive +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +SpaceBeforeCpp11BracedList: true +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpacesInAngles: false +SpacesInCStyleCastParentheses: false +SpacesInConditionalStatement: false +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakTemplateDeclarations: Yes +ColumnLimit: 120 +IncludeBlocks: Merge +SortIncludes: CaseSensitive +Language: Cpp +AccessModifierOffset: -4 +BreakConstructorInitializers: BeforeComma +AllowShortBlocksOnASingleLine: Always +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: true +CompactNamespaces: true +BinPackArguments: false +BinPackParameters: false + diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..66bc81b --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,101 @@ +Checks: '-*, +clang-diagnostic-*, +performance-*, +performance-inefficient-vector-operation, +cert-*, +portability-restrict-system-includes, +modernize-deprecated-ios-base-aliases, +modernize-loop-convert, +modernize-make-shared, +modernize-make-unique, +modernize-pass-by-value, +modernize-raw-string-literal, +modernize-redundant-void-arg, +modernize-replace-auto-ptr, +modernize-replace-disallow-copy-and-assign-macro, +modernize-return-braced-init-list, +modernize-use-auto, +modernize-use-bool-literals, 
+modernize-use-equals-default, +modernize-use-emplace, +modernize-use-equals-delete, +modernize-use-noexcept, +modernize-use-nullptr, +modernize-use-override, +modernize-use-using, +google-explicit-constructor, +google-build-using-namespace, +google-runtime-int, +misc-definitions-in-headers, +-bugprone-macro-parentheses, +readability-braces-around-statements, +-bugprone-branch-clone, +readability-identifier-naming, +hicpp-exception-baseclass, +misc-throw-by-value-catch-by-reference, +-bugprone-signed-char-misuse, +-bugprone-misplaced-widening-cast, +-bugprone-sizeof-expression, +-bugprone-narrowing-conversions, +google-global-names-in-headers, +llvm-header-guard, +misc-definitions-in-headers, +readability-container-size-empty' +WarningsAsErrors: '*' +AnalyzeTemporaryDtors: false +FormatStyle: none + +ExtraArgs: + # clang-tidy 17 started to complain (for unknown reasons) that various pragmas are unknown ("clang-diagnostic-unknown-pragmas"). + # This is technically a compiler error, not a clang-tidy error. We could litter the code base with more pragmas that suppress + # this error but it is better to pass the following flag to the compiler: + - '-Wno-unknown-pragmas' + - '-Wno-unused-command-line-argument' # similar issue + +CheckOptions: + - key: readability-identifier-naming.ClassCase + value: aNy_CasE + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.TypedefCase + value: lower_case + - key: readability-identifier-naming.TypedefSuffix + value: _t + - key: readability-identifier-naming.FunctionCase + value: aNy_CasE + - key: readability-identifier-naming.MemberCase + value: lower_case + - key: readability-identifier-naming.ParameterCase + value: lower_case + - key: readability-identifier-naming.ConstantCase + value: aNy_CasE + - key: readability-identifier-naming.ConstantParameterCase + value: lower_case + - key: readability-identifier-naming.NamespaceCase + value: lower_case + - key: readability-identifier-naming.MacroDefinitionCase + value: UPPER_CASE + - key: readability-identifier-naming.StaticConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.ConstantMemberCase + value: aNy_CasE + - key: readability-identifier-naming.StaticVariableCase + value: UPPER_CASE + - key: readability-identifier-naming.ClassConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.EnumConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.ConstexprVariableCase + value: UPPER_CASE + - key: readability-identifier-naming.StaticConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.TemplateTemplateParameterCase + value: UPPER_CASE + - key: readability-identifier-naming.TypeTemplateParameterCase + value: UPPER_CASE + - key: readability-identifier-naming.VariableCase + value: lower_case + - key: readability-identifier-naming.PrivateMemberPrefix + value: "m_" + - key: readability-identifier-naming.PrivateMethodCase + value: lower_case diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml new file mode 100644 index 0000000..ef73344 --- /dev/null +++ b/.github/workflows/CI.yaml @@ -0,0 +1,210 @@ +name: CI +run-name: ${{ github.actor }} is building + +on: push + +jobs: + check-format: + runs-on: [ self-hosted ] + steps: + - name: echo build + run: echo "todo" + + build: + needs: + - check-format + if: github.actor == 'azimafroozeh' + strategy: + fail-fast: true + matrix: + platform: [ self-hosted, m1 , avx512 ] + BUILD_TYPE: [ Debug, Release ] + cxx: [ clang++ ] + runs-on: ${{ 
matrix.platform }} + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + + - name: Make directory build + run: mkdir ${{github.workspace}}/build + + - name: Configure CMake + run: cmake -S ${{github.workspace}} -B ${{github.workspace}}/build + env: + CXX: ${{ matrix.cxx }} + + - name: Build + run: cmake --build ${{github.workspace}}/build -j 16 + + test: + needs: + - build + if: github.actor == 'azimafroozeh' + strategy: + fail-fast: true + matrix: + platform: [ self-hosted, m1, avx512 ] + BUILD_TYPE: [ Debug, Release ] + cxx: [ clang++ ] + runs-on: ${{ matrix.platform }} + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + + - name: Make directory build + run: mkdir ${{github.workspace}}/build + + - name: Configure CMake + run: cmake -DALP_BUILD_TESTING=ON -S ${{github.workspace}} -B ${{github.workspace}}/build + env: + CXX: ${{ matrix.cxx }} + + - name: Build + run: cmake --build ${{github.workspace}}/build -j 16 + + - name: Test + working-directory: ${{github.workspace}}/build + run: ctest -j 4 --rerun-failed --output-on-failure + + example: + needs: + - test + if: github.actor == 'azimafroozeh' + strategy: + fail-fast: true + matrix: + platform: [ self-hosted, m1 ] + BUILD_TYPE: [ Release ] + cc: [ clang ] + cxx: [ clang++ ] + runs-on: ${{ matrix.platform }} + + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + + - name: mkdir build + run: mkdir ${{github.workspace}}/build + + - name: Configure CMake + run: cmake -DALP_BUILD_EXAMPLE=ON -S ${{github.workspace}} -B ${{github.workspace}}/build + env: + CXX: ${{ matrix.cxx }} + + - name: Build + run: cmake --build ${{github.workspace}}/build -j 16 + + - name: run simple_compress + run: ${{github.workspace}}/build/example/simple_compress + + - name: run rd_compress + run: ${{github.workspace}}/build/example/rd_compress + + - name: run adaptive_compress + run: ${{github.workspace}}/build/example/adaptive_compress + + benchmark: + needs: + - test + if: github.actor == 'azimafroozeh' + strategy: + fail-fast: true + matrix: + platform: [ m1 ] + BUILD_TYPE: [ Release ] + cc: [ clang ] + cxx: [ clang++ ] + runs-on: ${{ matrix.platform }} + + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + + - name: mkdir build + run: mkdir ${{github.workspace}}/build + + - name: Configure CMake + run: cmake -DALP_BUILD_BENCHMARKING=ON -DALP_BUILD_TESTING=ON -DCMAKE_TOOLCHAIN_FILE=toolchain/m1.cmake -S ${{github.workspace}} -B ${{github.workspace}}/build + env: + CXX: ${{ matrix.cxx }} + + - name: Build + run: cmake --build ${{github.workspace}}/build -j 16 + + - name: Test + working-directory: ${{github.workspace}}/build + run: ctest -j 4 + + # - name: run bench_alp_cutter_decode + # run: ${{github.workspace}}/build/benchmarks/bench/bench_alp_cutter_decode + # + # - name: run bench_alp_cutter_encode + # run: ${{github.workspace}}/build/benchmarks/bench/bench_alp_cutter_encode + # + # - name: run bench_alp_encode + # run: ${{github.workspace}}/build/benchmarks/bench/bench_alp_encode + # + # - name: run bench_alp_without_sampling + # run: ${{github.workspace}}/build/benchmarks/bench/bench_alp_without_sampling + # + # - name: run bench_chimp + # run: ${{github.workspace}}/build/benchmarks/bench/bench_chimp + # + # - name: run bench_chimp128 + # run: ${{github.workspace}}/build/benchmarks/bench/bench_chimp128 + # + # - name: run bench_gorillas + # run: ${{github.workspace}}/build/benchmarks/bench/bench_gorillas + # + # - name: run bench_patas + # run: 
${{github.workspace}}/build/benchmarks/bench/bench_patas + # + # - name: run bench_zstd + # run: ${{github.workspace}}/build/benchmarks/bench/bench_zstd + + full_dataset: + needs: + - test + if: github.actor == 'azimafroozeh' + strategy: + fail-fast: true + matrix: + platform: [ m1 ] + BUILD_TYPE: [ Release ] + cc: [ clang ] + cxx: [ clang++ ] + runs-on: ${{ matrix.platform }} + + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + + - name: mkdir build + run: mkdir ${{github.workspace}}/build + + - name: Configure CMake + run: cmake -DALP_BUILD_BENCHMARKING=ON -DALP_BUILD_TESTING=ON -DALP_BUILD_BENCHMARKING_COMPRESSION_RATIO=ON -S ${{github.workspace}} -B ${{github.workspace}}/build + env: + CXX: ${{ matrix.cxx }} + + - name: Build + run: cmake --build ${{github.workspace}}/build -j 16 + + - name: run alp + run: ${{github.workspace}}/build/benchmarks/bench_compression_ratio/bench_alp_compression_ratio + + - name: run alp32 + run: ${{github.workspace}}/build/benchmarks/bench_compression_ratio/bench_alp32_compression_ratio + + - name: run zstd + run: ${{github.workspace}}/build/benchmarks/bench_compression_ratio/bench_zstd_compression_ratio + + + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..98dc6ea --- /dev/null +++ b/.gitignore @@ -0,0 +1,124 @@ +thirdparty/*.tar* +CMakeFiles/ +CMakeCache.txt +CTestTestfile.cmake +Makefile +!include/zstd/Makefile +cmake_install.cmake +build/ +*-build/ +Testing/ +build-support/boost_* +raw_binaries/ +test_*.cmake +*_tests.cmake +*_include.cmake +bench_*.cmake +bench_*.hpp +libzstd.pc + +bench_alp_cutter_decode +bench_alp_cutter_encode +bench_alp_encode +bench_alp_without_sampling +bench_chimp +bench_chimp128 +bench_gorillas +bench_patas +bench_zstd +bench_zstd_compression_ratio +bench_alp_compression_ratio +bench_alp32_compression_ratio +bench_chimp_compression_ratio +bench_chimp128_compression_ratio +bench_gorillas_compression_ratio +bench_patas_compression_ratio + +test_alp_vectorized +test_alp_sample +test_alp32_vectorized +test_chimp +test_chimp128 +test_gorillas +test_patas +test_zstd + +arm64v8_neon_intrinsic_1024_uf1_falp_test +arm64v8_sve_intrinsic_1024_uf1_falp_test +fallback_scalar_aav_1024_uf1_falp_test +fallback_scalar_nav_1024_uf1_falp_test + +simple_compress +simple_compress32 +rd_compress +rd_compress32 +adaptive_compress +benchmarks/fcbench/fcbench + +DartConfiguration.tcl + +.idea/ +.cmake/ +_deps/ + +# Build directories created by Clion +cmake-build-*/ + +######################################### +# Editor temporary/working/backup files # +.#* +*\#*\# +[#]*# +*~ +*$ +*.bak +*flymake* +*.kdev4 +*.log +*.swp + +######################################### +# ignore tmp directory +tmp + +######################################### from the fls_benchamrks integration +.vagrant/ +compile_commands.json +*.pdf +plot/*.log +plot/*.tex +.Rhistory + +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + diff --git a/BENCHMARKING.md b/BENCHMARKING.md new file mode 100644 index 0000000..550d3a4 --- /dev/null +++ b/BENCHMARKING.md @@ -0,0 +1,136 @@ +# Benchmarking + +Here we explain how to replicate the experiments presented in our [publication](https://dl.acm.org/doi/pdf/10.1145/3626717) and how to benchmark with your own data. 
+ +On the benchmarked datasets from our publication: +- ALP achieves 3x compression ratios on average (sometimes much higher). +- ALP encodes 0.5 values per CPU cycle. +- ALP decodes 2.6 values per CPU cycle. + +On [FCBench](https://www.vldb.org/pvldb/vol17/p1418-tao.pdf): +- ALP achieves a compression ratio of 2.08 (beating all other compressors). + +## Contents +- [Contents](#contents) +- [Build](#build) +- [Downloading Data](#downloading-data) + - [Environment Variables](#environment-variables) + - [Setup Data](#setup-data) +- [Compression Ratios Experiment](#compression-ratios-experiment) + - [ALP](#alp) + - [Chimp, Chimp128, Gorillas, Patas, Zstd Compression Ratios](#chimp-chimp128-gorillas-patas-zstd-compression-ratios) +- [Speed Tests](#speed-tests) + - [ALP Encoding Speed Test](#alp-encoding-speed-test) + - [ALP Decoding Speed Test](#alp-decoding-speed-test) + - [ALP RD Encoding Speed Test](#alp-rd-encoding-speed-test) + - [ALP RD Decoding Speed Test](#alp-rd-decoding-speed-test) + - [Chimp, Chimp128, Gorillas, Patas, Zstd Speed Test](#chimp-chimp128-gorillas-patas-zstd-speed-test) + - [PseudoDecimals Speed Test](#pseudodecimals-speed-test) + - [ELF Speed Test](#elf-speed-test) +- [FCBench](#fcbench) + + +## Build + +```shell +cmake [OPTIONS] . +make +``` + +Options: +- `-DALP_BUILD_EXAMPLE=ON`: Build the examples in `/example` +- `-DALP_BUILD_TESTING=ON`: Build ALP correctness tests in `/test` +- `-DALP_BUILD_BENCHMARKING=ON`: Build speed and compression ratio benchmarks in `/benchmarks` +- `-DALP_BUILD_GENERATED=ON`: Build FastLanes generated code in `/generated` + +You can also set these options directly inside the `CMakeLists.txt`. + +## Downloading Data + +You can download the datasets shown in our publication [here](https://drive.google.com/drive/folders/167faTwZJjqJMKM9Yc6E7KF5LUbsitxJS?usp=sharing) (`complete_binaries.zip`). They are in a binary format (64-bit doubles, one after another). These are the files we used to benchmark ALP compression ratios in the [publication](https://dl.acm.org/doi/pdf/10.1145/3626717). + +In addition, inside `data/datasets_transformer.ipynb` you can find a [Jupyter Notebook script](/data/datasets_transformer.ipynb) with guidelines to download the datasets from their original source and code to transform them to a binary format (64-bit doubles). Note that some of these require a heavy pre-processing phase. + +### Environment Variables +Set the environment variable `ALP_DATASET_DIR_PATH` to the path of the directory in which the complete binary datasets are located; either in your environment or manually in the [column.hpp](/data/include/column.hpp) file. + +### Setup Data +Inside `data/include/double_columns.hpp` you can find an array containing information about the datasets used to benchmark ALP. Each dataset's information includes a path to a sample of one vector (1024 values) in CSV format (inside `/data/samples/`) and a path to the entire file in binary format. + +The binary file is used to benchmark ALP compression ratios, while the CSV sample is used to benchmark ALP speed. To ensure the correctness of the speed tests, we also keep extra variables for each dataset: the number of exceptions and the bit width resulting after compression (unless the algorithm changes, these should remain consistent), and the factor/exponent indexes used to encode/decode the doubles into integers.
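+A hypothetical entry could look like the following sketch. The field names and their order here are illustrative only; the authoritative layout is the one in [column.hpp](/data/include/column.hpp).
+
+```c++
+// Illustrative only: the struct name and field names are assumptions,
+// see data/include/column.hpp for the real layout.
+// For the compression ratio test only the id, name, and binary_file_path matter;
+// the remaining fields hold the expected values used to verify the speed tests.
+Column my_dataset = {
+    /*id=*/31,
+    /*name=*/"my_dataset",
+    /*sample_csv_path=*/"data/samples/my_dataset.csv", // one vector (1024 values)
+    /*binary_file_path=*/"my_dataset.bin",             // located under ALP_DATASET_DIR_PATH
+    /*factor_idx=*/14,      // expected encoding factor index
+    /*exponent_idx=*/14,    // expected encoding exponent index
+    /*exceptions_count=*/5, // expected exceptions per vector
+    /*bit_width=*/48        // expected bit width after compression
+};
+```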
+ +To set up the data you want to run the tests on, add or remove entries in the array found in [double_columns.hpp](/data/include/double_columns.hpp) and `make` again. The data needed for each entry is detailed in [column.hpp](/data/include/column.hpp). To replicate the compression ratio tests you only need to set the dataset id, name, and binary_file_path. + +## Compression Ratios Experiment + +### ALP +After building and setting up the data, and the `ALP_DATASET_DIR_PATH` env variable, run the following: +```sh +./benchmarks/bench_compression_ratio/bench_alp_compression_ratio +``` +This will execute the tests found in the [/benchmarks/bench_compression_ratio/alp.cpp](/benchmarks/bench_compression_ratio/alp.cpp) file, which will compress an entire binary file and write the resulting (estimated) compression ratio results (in bits/value) from the datasets in [double_columns.hpp](/data/include/double_columns.hpp) to the `publication` directory. One CSV file will be created for the datasets which use the `ALP` scheme and another for those which use the `ALP_RD` scheme. Note that this is a dry compression (compressed data is not stored). + +### Chimp, Chimp128, Gorillas, Patas, Zstd Compression Ratios +After building and setting up the data, and the `ALP_DATASET_DIR_PATH` env variable, run the following: `./benchmarks/bench_compression_ratio/bench_{algorithm}_compression_ratio`, in which `algorithm` can be: `chimp|chimp128|gorillas|patas|zstd`. One CSV file will be created for each encoding and for each dataset in the `publication` directory. Note that this is a dry compression (compressed data is not stored). For PDE and ELF, we used their own code for compression ratios. + +## Speed Tests + +All of these tests read the CSV sample file locations from the dataset array. Therefore, to test with your own data, add your dataset to this array. Note that these experiments are performed on 1024 values. Why? Check Section 4 of the [publication](https://dl.acm.org/doi/pdf/10.1145/3626717). + +### ALP Encoding Speed Test +Encoding consists of the `encode`, `analyze_ffor`, and `ffor` primitives. Benchmarked by running: `./benchmarks/bench_speed/bench_alp_encode`. Results are located in `publication/results/`. + +### ALP Decoding Speed Test +Fused decoding consists of the `falp` and the `patch_exceptions` primitives. Unfused decoding consists of the `unffor`, `decode` and `patch_exceptions` primitives. Benchmark both fused and unfused at the same time on different implementations and architectures/ISAs by running the commands below. Results are located in `publication/results/`.
+ +| Implementation | Command | +|-----------------|------------------------------------------------------------------------------------------------------------| +| Scalar | `./generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench` | +| SIMD | `./generated/{Arch}/{Arch}_{extension}_intrinsic_uf1/{Arch}_{extension}_intrinsic_1024_uf1_falp_bench` | +| Auto-Vectorized | `./generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_bench` | + +The *correctness* can be tested by running: + +| Implementation | Command | +|-----------------|-----------------------------------------------------------------------------------------------------------| +| Scalar | `./generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_test` | +| SIMD | `./generated/{Arch}/{Arch}_{extension}_intrinsic_uf1/{Arch}_{extension}_intrinsic_1024_uf1_falp_test` | +| Auto-Vectorized | `./generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_test` | + +The source files of the `falp` primitive (FUSED ALP+FOR+Bitpack, generated by [FastLanes](https://github.com/cwida/FastLanes)) for each implementation are at: + +| Implementation | Source File | +|-----------------|------------------------------------------------------------------------------------------------------------| +| Scalar | `generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_src.cpp` | +| SIMD | `generated/{Arch}/{Arch}_{extension}_intrinsic_uf1/{Arch}_{extension}_intrinsic_1024_uf1_falp_src.cpp` | +| Auto-Vectorized | `generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_src.cpp` | + + +Architectures and ISAs: + +| Architecture {Arch} | ISA {extension} | +|--------------|---------| +| arm64v8 | neon | +| arm64v8 | sve | +| wasm | simd128 | +| x86_64 | sse | +| x86_64 | avx2 | +| x86_64 | avx512bw | + +### ALP RD Encoding Speed Test +Encoding consists of `rd_encode` and two calls to `ffor` (for both the left and right parts). Benchmarked by running: `./benchmarks/bench_speed/bench_alp_cutter_encode`. Results are located in `publication/results/`. + +### ALP RD Decoding Speed Test +Decoding consists of two calls to `unffor` (for both the left and right parts) and the `rd_decode` primitive. Benchmarked by running: `./benchmarks/bench_speed/bench_alp_cutter_decode`. Results are located in `publication/results/`. + +### Chimp, Chimp128, Gorillas, Patas, Zstd Speed Test +Both decoding and encoding are benchmarked by running `./benchmarks/bench_speed/bench_{algorithm}`, in which `algorithm` can be: `chimp|chimp128|gorillas|patas|zstd`. Results are located in `publication/results/i4i`. + +### PseudoDecimals Speed Test +We benchmarked PseudoDecimals within BtrBlocks. Results are located in `publication/results/i4i`. + +### ELF Speed Test +We benchmarked Elf using their Java implementation. + +## FCBench +We have benchmarked ALP compression ratios on the datasets presented in [FCBench](https://www.vldb.org/pvldb/vol17/p1418-tao.pdf). ALP comes out on top with an average **compression ratio of 2.08**, compared to the best other compressor in the benchmark (Bitshuffle + Zstd, with 1.47). ALP is superior even though the benchmark does horizontal compression instead of columnar compression (i.e. values from multiple columns in a table are compressed together).
\ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..57eedec --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,78 @@ +cmake_minimum_required(VERSION 3.22) + +set(CMAKE_CXX_STANDARD 17) +project(ALP) + +add_compile_options(-fPIC) + +# Options : ------------------------------------------------------------------------------------------------------------ +option(ALP_BUILD_EXAMPLE "Build Example" OFF) +option(ALP_BUILD_TESTING "Build Test" OFF) +option(ALP_BUILD_BENCHMARKING "Build Benchmark" OFF) +option(ALP_BUILD_BENCHMARKING_COMPRESSION_RATIO "Build Benchmarking compression ratio" OFF) +option(ALP_BUILD_GENERATED "Build generated ALP" OFF) + +#----------------------------------------------------------------------------------------------------------------------- +include(FetchContent) +include(CheckCXXCompilerFlag) +include(CMakePrintHelpers) +include(CTest) + +# CMAKE_SOURCE_DIR: ---------------------------------------------------------------------------------------------------- +add_compile_definitions(CMAKE_SOURCE_DIR="${CMAKE_SOURCE_DIR}") + +# Include -------------------------------------------------------------------------------------------------------------- +include_directories(include) + +# Src: ------------------------------------------------------------------------------------------------------- +add_subdirectory(src) + +# Example : ------------------------------------------------------------------------------------------------------------ +if (ALP_BUILD_EXAMPLE) + message("---------------------------------------------------------------------------------------------------------") + message("- Example:") + include_directories(${CMAKE_SOURCE_DIR}/data/include) + include_directories(${CMAKE_SOURCE_DIR}/example/include) + + add_subdirectory(${CMAKE_SOURCE_DIR}/example) +endif () + +# Test : --------------------------------------------------------------------------------------------------------------- +if (ALP_BUILD_TESTING) + message("---------------------------------------------------------------------------------------------------------") + message("- Testing:") + + # Gtest: ----------------------------------------------------------------------------------------------------------- + include(GoogleTest) + FetchContent_Declare(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG e2239ee6043f73722e7aa812a459f54a28552929 # release-1.11.0 + ) + # For Windows: Prevent overriding the parent project's compiler/linker settings + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) + + enable_testing() + + include_directories(${CMAKE_SOURCE_DIR}/test/include) + include_directories(${CMAKE_SOURCE_DIR}/data/include) + add_subdirectory(${CMAKE_SOURCE_DIR}/test) +endif () + +# Generated : ---------------------------------------------------------------------------------------------------------- +if (ALP_BUILD_GENERATED) + message("---------------------------------------------------------------------------------------------------------") + message("- Generated:") + + add_subdirectory(generated) +endif () + +# Benchmark : ---------------------------------------------------------------------------------------------------------- +if (ALP_BUILD_BENCHMARKING) + message("---------------------------------------------------------------------------------------------------------") + message("- Benchmarking:") + + include_directories(${CMAKE_SOURCE_DIR}/benchmarks/include) + 
include_directories(${CMAKE_SOURCE_DIR}/data/include) + add_subdirectory(benchmarks) +endif () \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..46301aa --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 CWI, Azim Afroozeh, Leonardo Xavier Kuffo Rivero + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/PRIMITIVES.md b/PRIMITIVES.md new file mode 100644 index 0000000..04b64af --- /dev/null +++ b/PRIMITIVES.md @@ -0,0 +1,144 @@ +# ALP Primitives + +You can make your own [de]compression API by using ALP primitives. An example of their usage can be found in our simple [compression](/include/alp/compressor.hpp) and [decompression](/include/alp/decompressor.hpp) API. The decoding primitives of ALP are auto-vectorized thanks to [FastLanes](https://github.com/cwida/FastLanes). For **benchmarking** purposes, we recommend you use these primitives. + +You can use these by including our library in your code: `#include "alp.hpp"` and accessing them through the `alp` namespace. + +## ALP + +### AlpEncode::init +```c++ +init(double|float* data_column, + size_t column_offset, + size_t tuples_count, + double|float* sample_arr, + alp::state& stt) +``` +Initializes the algorithm by performing the first-level sampling on the `data_column` buffer and deciding the appropriate scheme to use (by default `ALP`). The sampling is performed from the `column_offset` index until `column_offset + ALP_ROUWGROUP_SIZE`. + +### AlpEncode::encode +```c++ +encode( double|float* input_vector, + double|float* exceptions, + uint16_t* exceptions_positions, + uint16_t* exceptions_count, + int64_t* encoded_integers, // Encoded integers are always int64 + alp::state& stt) +``` +Uses `ALP` to encode the values in `input_vector` into integers using the `factor` and `exponent` stored in the state (`stt`). Encoded values are stored in `encoded_integers` alongside their `exceptions`, their `exceptions_positions` and the `exceptions_count`. The input vector is assumed to point to `ALP_VECTOR_SIZE` (1024) elements. Here, the second-level sampling is performed if necessary. + +### ffor::ffor +```c++ +ffor(int64_t|int32_t* in, + int64_t|int32_t* out, + uint8_t bit_width, + int64_t|int32_t* ffor_base) +``` +Encode `in` using FFOR (FOR + BP), writing the result to `out`. `in` is assumed to point to `ALP_VECTOR_SIZE` (1024) elements. The target `bit_width` and the frame of reference (`ffor_base`) must be given. The `alp::analyze_ffor()` primitive can be used to obtain both from an array of integers.
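+For instance, deriving the bit width and base for one vector and packing it could look like the following minimal sketch. Namespace qualifiers are omitted and the primitives are spelled as in the headings of this document; see the [compressor](/include/alp/compressor.hpp) for authoritative usage.
+
+```c++
+#include "alp.hpp"
+#include <cstdint>
+
+// Sketch: FFOR-pack one vector (ALP_VECTOR_SIZE == 1024) of ALP-encoded integers.
+void pack_vector(int64_t* encoded_integers, int64_t* ffor_out) {
+    uint8_t bit_width;
+    int64_t ffor_base;
+    analyze_ffor(encoded_integers, bit_width, &ffor_base); // derive bit width + frame of reference
+    ffor(encoded_integers, ffor_out, bit_width, &ffor_base); // FOR + bitpack into only bit_width bits
+}
+```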
+ + +### AlpEncode::analyze_ffor +```c++ +analyze_ffor(int64_t* input_vector, + uint8_t& bit_width, + int64_t* ffor_base) +``` +Reads the values in `input_vector` and sets the proper `bit_width` and frame of reference (`ffor_base`) to FOR+bitpack the array. + + +### AlpDecode::decode +```c++ +decode(uint64_t* encoded_integers, + uint8_t fac_idx, + uint8_t exp_idx, + double|float* output) +``` +Uses `ALP` to decode the values in `encoded_integers` into `output` using `factor` and `exponent` for the decoding multiplication. The size of the encoded integers array and the output buffer are assumed to be `ALP_VECTOR_SIZE` (1024). + +### ffor::unffor +```c++ +unffor(int64_t|int32_t* in, + int64_t|int32_t* out, + uint8_t bit_width, + int64_t|int32_t* ffor_base) +``` +Decode `in` by reversing the FFOR (FOR + BP) and writing to `out`. `in` is assumed to point to `ALP_VECTOR_SIZE` (1024) elements. The target `bit_width` and the frame of reference (`ffor_base`) must be given. + +### generated::falp::fallback::scalar::falp +```c++ +falp(uint64_t* in, + double* out, + uint8_t bit_width, + uint64_t* ffor_base, + uint8_t factor, + uint8_t exponent) +``` +CURRENTLY ONLY AVAILABLE FOR `double` + +Fused implementation of `decode` and `unffor`. Decode `in` with ALP, reverse the FFOR (FOR + BP) and write to `out`. `in` is assumed to point to `ALP_VECTOR_SIZE` (1024) elements. The target `bit_width`, the frame of reference (`ffor_base`), and the encoding `factor` and `exponent` indexes must be given. + +### AlpDecode::patch_exceptions +```c++ +patch_exceptions(double|float* output, + double|float* exceptions, + uint16_t* exceptions_positions, + uint16_t* exceptions_count) +``` +Patch the exceptions in `output` using their positions and respective count. + + +## ALP RD +### AlpRD::init +```c++ +init(double|float* data_column, + size_t column_offset, + size_t tuples_count, + double|float* sample_arr, + alp::state& stt) +``` +Initializes the algorithm by performing the first-level sampling on the `data_column` buffer. The sampling is performed from the `column_offset` index until `column_offset + ALP_ROUWGROUP_SIZE`. Afterwards, the best position to cut the floating-point values is found and the dictionary to encode the left parts is built and stored in `stt.left_parts_dict`. + +### AlpRD::encode +```c++ +encode(double|float* input_vector, + uint16_t* exceptions, + uint16_t* exception_positions, + uint16_t* exceptions_count, + uint64_t|uint32_t* right_parts, + uint16_t* left_parts, + alp::state& stt) +``` +Uses `ALP_RD` to encode the values in `input_vector` into their left and right parts alongside their `exceptions`, their `exceptions_positions` and the `exceptions_count`. The input vector is assumed to point to `ALP_VECTOR_SIZE` (1024) elements. Here, the second-level sampling is performed if necessary. + +### AlpRD::decode +```c++ +decode(double|float* a_out, + uint64_t|uint32_t* unffor_right_arr, + uint16_t* unffor_left_arr, + uint16_t* exceptions, + uint16_t* exceptions_positions, + uint16_t* exceptions_count, + state& stt) +``` +Uses `ALP_RD` to decode the values in `unffor_right_arr` and `unffor_left_arr` by gluing them. The size of the encoded integers array and the output buffer are assumed to be `ALP_VECTOR_SIZE` (1024). Exception patching is fused in this function. + +## Using the Primitives + +### Rowgroup Level +The `init` primitives should be called once per rowgroup. They set the necessary `state` that other primitives need. All other primitives should be called per vector (1024 values), as in the sketch below.
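+A minimal single-vector round trip under this call pattern could look like the following sketch. Namespace qualifiers are again omitted, and `stt.fac`/`stt.exp` stand for the factor and exponent indexes kept in the state (the exact field names may differ; see the [compressor](/include/alp/compressor.hpp) and [decompressor](/include/alp/decompressor.hpp) for authoritative usage).
+
+```c++
+#include "alp.hpp"
+#include <cstddef>
+#include <cstdint>
+
+// Sketch: encode and decode one rowgroup, one 1024-value vector at a time.
+void roundtrip_rowgroup(double* column, size_t tuples_count) {
+    alp::state stt;
+    double sample_buf[1024]; // scratch buffer used by the first-level sampling
+    init(column, /*column_offset=*/0, tuples_count, sample_buf, stt); // once per rowgroup
+
+    for (size_t i = 0; i + 1024 <= tuples_count; i += 1024) { // per vector
+        double   exceptions[1024];
+        uint16_t exc_pos[1024];
+        uint16_t exc_cnt = 0;
+        int64_t  encoded[1024];
+        int64_t  packed[1024];
+        int64_t  ffor_base;
+        uint8_t  bit_width;
+
+        // Encoding: encode + analyze_ffor + ffor
+        encode(column + i, exceptions, exc_pos, &exc_cnt, encoded, stt);
+        analyze_ffor(encoded, bit_width, &ffor_base);
+        ffor(encoded, packed, bit_width, &ffor_base);
+
+        // Unfused decoding: unffor + decode + patch_exceptions
+        int64_t unpacked[1024];
+        double  output[1024];
+        unffor(packed, unpacked, bit_width, &ffor_base);
+        decode(reinterpret_cast<uint64_t*>(unpacked), stt.fac, stt.exp, output); // decode expects uint64_t*
+        patch_exceptions(output, exceptions, exc_pos, &exc_cnt);
+    }
+}
+```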
+ +### ALP +Encoding consists of the `encode`, `analyze_ffor`, and `ffor` primitives. + +Fused decoding consists of the `falp` and the `patch_exceptions` primitives. Unfused decoding consists of the `unffor`, `decode` and `patch_exceptions` primitives. + + +### ALP RD +Encoding consists of `encode` and two calls to `ffor` (for both the left and right parts). + +Decoding consists of two calls to `unffor` (for both the left and right parts) and the `decode` primitive. + +### Last Vector Encoding +ALP primitives operate on blocks of 1024 values (to easily auto-vectorize). As such, the last vector of a dataset may be incomplete (`vector_size != ALP_VECTOR_SIZE`). A few strategies can be implemented to encode an incomplete vector: +- Fill the missing values of the vector with the first value or with `0.0`. Pros: Easy to implement and efficient. Cons: The value may be an exception; `0.0` can negatively affect the bitpacking size. +- Fill the vector with the first non-exception value after encoding (implemented in our example Compression API). Pros: This will yield the best compression ratio for the last vector. Cons: The vector must be encoded twice. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..4e244e6 --- /dev/null +++ b/README.md @@ -0,0 +1,97 @@ +# ALP: Adaptive Lossless Floating-Point Compression +A lossless floating-point compression algorithm for the `double`/`float` data types. ALP significantly improves over all previous floating-point encodings in both speed and compression ratio (figure below; each dot represents a dataset). +

+![ALP Benchmarks](/publication/alp_results.png)

+ +- ⚡ **High Speed**: Scans 44x faster than Gorillas, 64x faster than Chimp, 31x faster than Zstd. Compresses 11x faster than Zstd, 138x faster than PDE and 10x faster than Chimp. +- ✅ **High Compression**: 50% more compression than Gorillas. 24% more than Chimp128. On par with Zstd level 3. +- ✅ **Adapts to data**: By using a two-stage algorithm that first samples row-groups and then vectors. +- ✅ **Scalar code**: Auto-vectorizes thanks to [FastLanes](https://github.com/cwida/FastLanes). +- ✅ **Lightweight Encoding**: Compression and decompression occur in blocks of 1024 values. Ideal for columnar databases. +- ✅ **Proven Effectiveness**: Effectiveness and speed led to deprecating Chimp128 and Patas in DuckDB. +- ✅ **Works on difficult floats**: Can losslessly compress even floats found as ML model parameters better than Zstd and all other encodings. + + +To *rigorously* benchmark ALP with your own data we provide our [ALP primitives](#alp-primitives) as a single C++ header file. + +To *quickly* test ALP we recommend following [our examples](#quickstart) in the Quickstart guide or [using it in DuckDB](#alp-in-duckdb) (note that ALP inside DuckDB is slower than using our primitives). + +ALP details can be found in the [publication](https://dl.acm.org/doi/pdf/10.1145/3626717). + +### Used By +

+**DuckDB**

+ +### Contents +- [ALP in a Nutshell](#alp-in-a-nutshell) +- [Quickstart](#quickstart) +- [Building and Running](#building-and-running) +- [ALP Primitives](#alp-primitives) +- [ALP in DuckDB](#alp-in-duckdb) +- [Benchmarking (Replicating Paper Experiments)](#benchmarking-replicating-paper-experiments) + +## ALP in a Nutshell +ALP has two compression schemes: `ALP` for doubles/floats which were once decimals, and `ALP_RD` for true doubles/floats (e.g. the ones which stem from many calculations, scientific data, ML weights). + + +`ALP` losslessly transforms doubles/floats to integer values with two multiplications to FOR+BitPack them into only the necessary bits. This is a strongly enhanced version of [PseudoDecimals](https://dl.acm.org/doi/abs/10.1145/3589263). + +`ALP_RD` splits the bitwise representation of doubles/floats into two parts (left and right). The left part is encoded with dictionary compression and the right part is bitpacked to just the necessary bits. + +Both encodings operate on vectors of 1024 values at a time (fitting *vectorized execution*) and leverage in-vector commonalities to achieve higher compression ratios and be faster (by avoiding per-value adaptivity) than other methods. + +Both encodings encode outliers as *exceptions* to achieve higher compression ratios. + +## Quickstart + +[Usage examples](/example/) are available under the `example` directory. Here, we use a simple [de]compression API to store/read ALP data in/from memory. +- [Simple compress](/example/simple_compress.cpp): An example to compress a buffer of random doubles with limited decimal precision. Also available for [32-bit single precision](/example/simple_compress32.cpp) +- [RD Compress](/example/rd_compress.cpp): An example to directly compress using the `ALP_RD` scheme if the data are true doubles. Also available for [32-bit single precision](/example/rd_compress32.cpp) +- [Adaptive Compress](/example/adaptive_compress.cpp): An example in which half of the data is of limited decimal precision and half of the data are true doubles. + +Note that the [de]compression API used by these examples is only a naive wrapper of the real ALP core: [the primitives](#alp-primitives). + +## Building and Running +Requirements: +1) __Clang++__ +2) __CMake__ 3.22 or higher + +Building and running the [simple compress](/example/simple_compress.cpp) example: +```sh +cmake -DALP_BUILD_EXAMPLE=ON . # or set the option in the CMakeLists.txt +cd example +make +./simple_compress +``` + +This will also generate the ALP primitives. + + +## ALP Primitives +You can make your own [de]compression API by using ALP primitives. An example of their usage can be found in our simple [compression](/include/alp/compressor.hpp) and [decompression](/include/alp/decompressor.hpp) API. The decoding primitives of ALP are auto-vectorized thanks to [FastLanes](https://github.com/cwida/FastLanes). For **benchmarking** purposes, we recommend you use these primitives. + +You can use these by including our library in your code: `#include "alp.hpp"`. + +Check the full documentation of these in the [PRIMITIVES.md](/PRIMITIVES.md) readme. + +## ALP in DuckDB +ALP replaced Chimp128 and Patas in [DuckDB](https://github.com/duckdb/duckdb/pull/9635). In DuckDB, ALP is **2-4x faster** than Patas (at decompression), achieving **compression ratios twice as high** (sometimes even much higher). DuckDB can be used to quickly test ALP on custom data; however, we advise against doing so if your purpose is to rigorously benchmark ALP against other algorithms.
+ +[Here](https://github.com/duckdb/duckdb/blob/main/benchmark/micro/compression/alp/alp_read.benchmark) you can find a basic example of how to load data into DuckDB while forcing ALP to be used as the compression method. These statements can be called using the Python API. + +**Please note**: ALP inside DuckDB: i) is slower than using our primitives presented here, and ii) compression ratios can be slightly worse due to the metadata needed to skip vectors and the DuckDB storage layout. + + +## Benchmarking (Replicating Paper Experiments) +In [BENCHMARKING.md](/BENCHMARKING.md) we detail how to replicate the experiments and benchmarks presented in our [publication](https://dl.acm.org/doi/pdf/10.1145/3626717). + +On the benchmarked datasets from our publication: +- ALP achieves on average **3x compression ratios** (sometimes much higher). +- ALP encodes on average 0.5 doubles per CPU cycle. +- ALP decodes on average 2.6 doubles per CPU cycle. + +On [FCBench](https://www.vldb.org/pvldb/vol17/p1418-tao.pdf): +- ALP achieves a compression ratio of 2.08 (beating all other compressors). diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 0000000..bbae241 --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,30 @@ +message(STATUS "Fetching ZSTD ${ZSTD_REPOSITORY}") +if (${CMAKE_VERSION} VERSION_LESS "3.11") + message(FATAL_ERROR "CMake 3.11 required to fetch zstd") +endif () + +FetchContent_Declare( + zstd + GIT_REPOSITORY https://github.com/facebook/zstd + GIT_TAG 794ea1b0afca0f020f4e57b6732332231fb23c70) + +FetchContent_MakeAvailable(zstd) + +include_directories(${zstd_SOURCE_DIR}/lib) +option(ZSTD_BUILD_TESTS OFF) + +add_subdirectory(${zstd_SOURCE_DIR}/build/cmake ${zstd_BINARY_DIR}) + +add_subdirectory(test) +add_subdirectory(bench_speed) + + +# Benchmark Full Dataset : --------------------------------------------------------------------------------------------- +if (ALP_BUILD_BENCHMARKING_COMPRESSION_RATIO) + message("---------------------------------------------------------------------------------------------------------") + message("- Benchmark compression ratio:") + add_subdirectory(bench_compression_ratio) +endif () + + + diff --git a/benchmarks/analyze_better_blocks/.clang-format b/benchmarks/analyze_better_blocks/.clang-format new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/.clang-tidy b/benchmarks/analyze_better_blocks/.clang-tidy new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/.gitignore b/benchmarks/analyze_better_blocks/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/.projectile b/benchmarks/analyze_better_blocks/.projectile new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/.rtags b/benchmarks/analyze_better_blocks/.rtags new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/CMakeLists.txt b/benchmarks/analyze_better_blocks/CMakeLists.txt new file mode 100644 index 0000000..c6879cf --- /dev/null +++ b/benchmarks/analyze_better_blocks/CMakeLists.txt @@ -0,0 +1,134 @@ +# --------------------------------------------------------------------------- +# BtrBlocks +# --------------------------------------------------------------------------- +project(Analyze_better_blocks) +cmake_minimum_required(VERSION 3.22) + +# --------------------------------------------------------------------------- +# Environment +#
--------------------------------------------------------------------------- + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_C_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=native") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -g") +add_compile_options(-march=native) + +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND CMAKE_BUILD_TYPE MATCHES Debug) + add_compile_options(-fstandalone-debug) +endif () + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if (APPLE) + list(APPEND CMAKE_PREFIX_PATH /usr/local/opt/bison) + list(APPEND CMAKE_PREFIX_PATH /usr/local/opt/flex) +endif (APPLE) + +if (CYGWIN) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++") +endif (CYGWIN) + +option(NO_SIMD "NO_SIMD" OFF) + +# --------------------------------------------------------------------------- +# Dependencies +# --------------------------------------------------------------------------- + +find_package(Threads REQUIRED) + +set(THREADS_PREFER_PTHREAD_FLAG ON) + +include("${CMAKE_SOURCE_DIR}/vendor/aws-sdk.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/benchmark.cmake") +#include("${CMAKE_SOURCE_DIR}/vendor/googletest.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/gflags.cmake") +#include("${CMAKE_SOURCE_DIR}/vendor/rapidjson.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/yaml-cpp.cmake") +include("${CMAKE_SOURCE_DIR}/cmake/clang-tidy.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/fastpfor.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/spdlog.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/tbb.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/croaring.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/gdouble.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/turbo.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/lz4.cmake") +include("${CMAKE_SOURCE_DIR}/vendor/fsst.cmake") + +# --------------------------------------------------------------------------- +# Includes +# --------------------------------------------------------------------------- +include_directories(shared-headers) + +include("${CMAKE_SOURCE_DIR}/shared-headers/local.cmake") +include_directories( + ${CMAKE_SOURCE_DIR}/include + ${FLEX_INCLUDE_DIRS} + ${GTEST_INCLUDE_DIR} + ${GFLAGS_INCLUDE_DIR} +) + +# --------------------------------------------------------------------------- +# Sources +# --------------------------------------------------------------------------- + +include("${CMAKE_SOURCE_DIR}/playground/local.cmake") +add_subdirectory(cengine) +add_subdirectory(tools) + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +# Tester does not link correctly because of tzt_stats +add_subdirectory(test) + +# --------------------------------------------------------------------------- +# Benchmarks +# --------------------------------------------------------------------------- + +include("${CMAKE_SOURCE_DIR}/benchmark/local.cmake") + +# --------------------------------------------------------------------------- +# Executable +# --------------------------------------------------------------------------- + +# add_executable(harbook harbook.cpp) +# add_dependencies(harbook cengine) +# target_link_libraries(harbook cengine psql) +# target_include_directories(harbook PRIVATE ${CENGINE_INCLUDE_DIR}) +# 
--------------------------------------------------------------------------- +# Linting +# --------------------------------------------------------------------------- + +add_custom_target(lint) +add_dependencies(lint ${lint_targets}) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +message(STATUS "[compiler] settings") +message(STATUS " CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}") +message(STATUS " CMAKE_CXX_FLAGS_RELEASE = ${CMAKE_CXX_FLAGS_RELEASE}") +message(STATUS "[cengine] settings") +message(STATUS " GFLAGS_INCLUDE_DIR = ${GFLAGS_INCLUDE_DIR}") +message(STATUS " GFLAGS_LIBRARY_PATH = ${GFLAGS_LIBRARY_PATH}") +message(STATUS "[TEST] settings") +message(STATUS " GTEST_INCLUDE_DIR = ${GTEST_INCLUDE_DIR}") +message(STATUS " GTEST_LIBRARY_PATH = ${GTEST_LIBRARY_PATH}") +message(STATUS " CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}") +message(STATUS " CMAKE_C_COMPILER = ${CMAKE_C_COMPILER}") +message(STATUS " CMAKE_CXX_COMPILER = ${CMAKE_CXX_COMPILER}") +message(STATUS " CMAKE_C_FLAGS_RELEASE = ${CMAKE_C_FLAGS_RELEASE}") +message(STATUS " CMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}") +message(STATUS " CMAKE_C_STANDARD = ${CMAKE_C_STANDARD}") + +# --------------------------------------------------------------------------- +# ALP +# --------------------------------------------------------------------------- +include("${CMAKE_SOURCE_DIR}/alp/manual/CMakeLists.txt") \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/README.md b/benchmarks/analyze_better_blocks/README.md new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/alp/manual/CMakeLists.txt b/benchmarks/analyze_better_blocks/alp/manual/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/alp/manual/bench_ped.cpp b/benchmarks/analyze_better_blocks/alp/manual/bench_ped.cpp new file mode 100644 index 0000000..2755c95 --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp/manual/bench_ped.cpp @@ -0,0 +1,184 @@ +#include "bench_ped.hpp" +#include "PerfEvent.hpp" +#include "Units.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +#include "datablock/schemes/CSchemePool.hpp" +#include "datablock/schemes/v2/double/Decimal.hpp" +#include "datablock/schemes/v2/double/DoubleBP.hpp" +#include "datablock/schemes/v2/double/DynamicDictionary.hpp" +#include "datablock/schemes/v2/double/Frequency.hpp" +#include "datablock/schemes/v2/double/RLE.hpp" +#include "datablock/schemes/v2/integer/PBP.hpp" +#include "gflags/gflags.h" +#include "include/datasets.hpp" +#include "include/datasets_complete.hpp" +#include "spdlog/fmt/bundled/ranges.h" +#include "spdlog/spdlog.h" +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +// for some reason, this is only DECLARED in DynamicDictionary but not defined (breaks linking) +// and then DEFINED in every cpp file that uses it +DEFINE_string(fsst_stats, "", ""); +DEFINE_string(file_list_file, "pbi-double-columns.txt", "file-list"); +DEFINE_int32(cascade_depth, 1, "cascade"); + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_decoding_ped(dataset::Dataset& dataset, // + cengine::db::v2::d::Decimal& pd, // + cengine::db::DoubleStats& stats, // + double* dst, // + uint8_t* compressed_arr, // + size_t cascade) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1;
+#endif + + std::string benchmark_name = dataset.name + "_decode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + pd.decompress(dst, nullptr, compressed_arr, stats.tuple_count, cascade); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_encoding_ped(dataset::Dataset& dataset, // + cengine::db::v2::d::Decimal& pd, // + double* dbl_arr, // + uint8_t* compressed_arr, // + cengine::db::DoubleStats& stats, + size_t cascade) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode"; + size_t output_bytes {0}; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + output_bytes = pd.compress(dbl_arr, nullptr, compressed_arr, stats, cascade); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + output_bytes, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} +void setupSchemePool() { + using namespace cengine::db; + cengine::db::CSchemePool::refresh(); + auto& schemes = *cengine::db::CSchemePool::available_schemes; + + // for (auto& scheme : schemes.double_schemes) { + // std::cout << ConvertSchemeTypeToString(scheme.first) << std::endl; + // } + // for (auto& scheme : schemes.integer_schemes) { + // std::cout << ConvertSchemeTypeToString(scheme.first) << std::endl; + // } + + // double: DOUBLE_BP, UNCOMPRESSED, + for (auto it = schemes.double_schemes.begin(); it != schemes.double_schemes.end();) { + if (it->first != DoubleSchemeType::DOUBLE_BP // + && it->first != DoubleSchemeType::UNCOMPRESSED // + && it->first != DoubleSchemeType::ONE_VALUE // + ) { + it = schemes.double_schemes.erase(it); + } else { + ++it; + } + } + + // int: X_FBP, UNCOMPRESSED, + for (auto it = schemes.integer_schemes.begin(); it != schemes.integer_schemes.end();) { + if (it->first != IntegerSchemeType::X_PBP // + && it->first != IntegerSchemeType::UNCOMPRESSED // + // && it->first != IntegerSchemeType::ONE_VALUE // + ) { + it = schemes.integer_schemes.erase(it); + } else { + ++it; + } + } +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + + uint8_t* compressed_arr; + double* dbl_arr; + double* dec_dbl_arr; + + dbl_arr = new (std::align_val_t {64}) double[1024]; + dec_dbl_arr = new (std::align_val_t {64}) double[1024 * 100]; + compressed_arr = new (std::align_val_t {64}) uint8_t[1024 * 1000000000]; + + for (auto& dataset : dataset::datasets) { + std::ifstream ifile(dataset.file_path, std::ios::in); + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + /* Init encoding */ + setupSchemePool(); + cengine::db::v2::d::Decimal pd; + size_t cascade = 2; + size_t output_bytes; + size_t size = 1024; + std::vector dst(size * 2, 0); + cengine::db::DoubleStats stats(dbl_arr, nullptr, size); + stats = cengine::db::DoubleStats::generateStats(dbl_arr, nullptr, size); + + /* Benchmark Encoding */ + 
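+        // The reported value below is cycles per value: the harness times the
+        // whole encode loop and divides total cycles by (iterations * 1024),
+        // one 1024-value vector per dataset sample.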
benchmark.Run(bench_encoding_ped(dataset, pd, dbl_arr, compressed_arr, stats, cascade)); + + /* Encode */ + output_bytes = pd.compress(dbl_arr, nullptr, compressed_arr, stats, cascade); + + // Init decoding + + // Benchmark decoding + benchmark.Run(bench_decoding_ped(dataset, pd, stats, dst.data(), compressed_arr, cascade)); + + ifile.close(); + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("ped") + .save() + .at(std::string(SOURCE_DIR) + "/fls_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/analyze_better_blocks/alp/manual/duplicate.cpp b/benchmarks/analyze_better_blocks/alp/manual/duplicate.cpp new file mode 100644 index 0000000..0444895 --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp/manual/duplicate.cpp @@ -0,0 +1,154 @@ +#include "PerfEvent.hpp" +#include "Units.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +#include "datablock/schemes/CSchemePool.hpp" +#include "datablock/schemes/v2/double/Decimal.hpp" +#include "datablock/schemes/v2/double/DoubleBP.hpp" +#include "datablock/schemes/v2/double/DynamicDictionary.hpp" +#include "datablock/schemes/v2/double/Frequency.hpp" +#include "datablock/schemes/v2/double/RLE.hpp" +#include "datablock/schemes/v2/integer/PBP.hpp" +#include "gflags/gflags.h" +#include "spdlog/fmt/bundled/ranges.h" +#include "spdlog/spdlog.h" +#include +#include +#include + +// for some reason, this is only DECLARED in DynamicDictionary but not defined (breaks linking) +// and then DEFINED in every cpp file that uses it +DEFINE_string(fsst_stats, "", ""); +DEFINE_string(file_list_file, "pbi-double-columns.txt", "file-list"); +DEFINE_int32(cascade_depth, 1, "cascade"); + +// example2.double: s3://public-bi-benchmark/binary/Telco/1/Telco_1/106_RECHRG_INC_MIN_USED_P1.double +// example2.bitmap: s3://public-bi-benchmark/binary/Telco/1/Telco_1/106_RECHRG_INC_MIN_USED_P1.bitmap + +struct InputFiles { + std::ifstream list; + InputFiles(const std::string& filename) + : list(filename) { + std::cout << "file " << filename << std::endl; + } + + bool next(std::string& output) { return !(std::getline(list, output).fail()); } +}; + +std::string ensure_file(const std::string& object) { return object; } + +using T = double; +bool test_compression( + cengine::db::DoubleScheme& scheme, cengine::db::DoubleStats& stats, T* src, size_t size, PerfEvent& e, u8 cascade) { + std::vector compressed(size * sizeof(T) * 2); + std::vector dst(size * 2, 0); + + auto src_ptr = src; + auto compressed_ptr = reinterpret_cast(compressed.data()); + auto dst_ptr = dst.data(); + + size_t output_bytes {0}; + e.setParam("cascade", cascade); + e.setParam("phase", "compression"); + { + PerfEventBlock blk(e, size); + output_bytes = scheme.compress(src_ptr, nullptr, compressed_ptr, stats, cascade); + // std::cout << "cf: " << 1.0 * size * sizeof(T) / output_bytes << std::endl; + e.setParam("compr", output_bytes / (1.0 * size * sizeof(T)) * 64); + } + + // e.setParam("phase", "decompression"); + { + // PerfEventBlock blk(e, size); + scheme.decompress(dst_ptr, nullptr, compressed_ptr, stats.tuple_count, cascade); + } + // std::cerr << "Decompression done." 
<< std::endl; + for (auto i = 0ul; i != size; ++i) { + die_if(src[i] == dst[i]); + } + + return 0; +} + +void setupSchemePool() { + using namespace cengine::db; + cengine::db::CSchemePool::refresh(); + auto& schemes = *cengine::db::CSchemePool::available_schemes; + return; + // double: DOUBLE_BP, UNCOMPRESSED, + for (auto it = schemes.double_schemes.begin(); it != schemes.double_schemes.end();) { + if (it->first != DoubleSchemeType::DOUBLE_BP && it->first != DoubleSchemeType::UNCOMPRESSED) { + it = schemes.double_schemes.erase(it); + } else { + ++it; + } + } + // int: X_FBP, UNCOMPRESSED, + for (auto it = schemes.integer_schemes.begin(); it != schemes.integer_schemes.end();) { + if (it->first != IntegerSchemeType::X_FBP && it->first != IntegerSchemeType::UNCOMPRESSED) { + it = schemes.integer_schemes.erase(it); + } else { + ++it; + } + } +} + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + setupSchemePool(); + spdlog::set_level(spdlog::level::off); + PerfEvent perf; + std::cerr << "using cascade depth " << FLAGS_cascade_depth << " and input file " << FLAGS_file_list_file + << std::endl; + + InputFiles file_list(FLAGS_file_list_file); + + std::string nextfile; + while (file_list.next(nextfile)) { + std::string outfile = ensure_file(nextfile); + + Vector doubles(outfile.c_str()); + + { + std::vector head(doubles.data, doubles.data + std::min(10ul, doubles.size())); + // spdlog::info("size: {:03.2f} MiB, head: {}", (sizeof(T) * doubles.size()) * 1.0 / 1024 / 1024, + //head); + } + + perf.setParam("column", nextfile); + // perf.setParam("scheme", "none"); + // perf.setParam("compr", 1); + // perf.setParam("cascade", 0); + + cengine::db::DoubleStats stats(doubles.data, nullptr, doubles.size()); + // perf.setParam("phase", "stats"); + { + // PerfEventBlock blk(perf, doubles.size()); + stats = cengine::db::DoubleStats::generateStats(doubles.data, nullptr, doubles.size()); + } + + // perf.setParam("scheme", "bitpack"); + // cengine::db::v2::d::DoubleBP bp; + // test_compression(bp, stats, doubles.data, doubles.count, perf, 0); + + perf.setParam("scheme", "decimal"); + cengine::db::v2::d::Decimal pd; + test_compression(pd, stats, doubles.data, doubles.count, perf, 2); + // test_compression(pd, stats, doubles.data, doubles.count, perf, 2); + + // perf.setParam("scheme", "dict"); + // cengine::db::v2::d::DynamicDictionary dict; + // test_compression(dict, stats, doubles.data, doubles.count, perf, 1); + // test_compression(dict, stats, doubles.data, doubles.count, perf, 2); + // + // perf.setParam("scheme", "rle"); + // cengine::db::v2::d::RLE rle; + // test_compression(rle, stats, doubles.data, doubles.count, perf, 1); + // test_compression(dict, stats, doubles.data, doubles.count, perf, 2); + + // perf.setParam("scheme", "freq"); + // cengine::db::v2::d::Frequency freq; + // test_compression(freq, stats, doubles.data, doubles.count, perf, 1); + // test_compression(freq, stats, doubles.data, doubles.count, perf, 2); + } +} diff --git a/benchmarks/analyze_better_blocks/alp/manual/include/alp.hpp b/benchmarks/analyze_better_blocks/alp/manual/include/alp.hpp new file mode 100644 index 0000000..846d78b --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp/manual/include/alp.hpp @@ -0,0 +1,1159 @@ +#ifndef ALP_HPP +#define ALP_HPP + +#pragma once +#include "common.hpp" +#include "config.hpp" +#include +#include +#include +#include +#include +#include +#include + +#ifdef __AVX2__ +#include +#endif + +/* + * use magic_number . 
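+ *
+ * sketch of the idea (assuming IEEE-754 binary64): the magic constant is
+ * 2^52 + 2^51 = 0x0018000000000000. adding it to any |x| < 2^51 pushes the
+ * fractional bits out of the 52-bit mantissa, so the addition itself rounds
+ * to nearest; subtracting the constant back leaves the rounded integer.
+ * e.g. 3.7 + 6755399441055744.0 is stored as 6755399441055748.0, and
+ * 6755399441055748 - 6755399441055744 = 4, so double_to_int64(3.7) == 4.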
+ * in case of an exception -> keep the previous digits.
+ * rounding idea
+ * new idea
+ */
+
+namespace alp {
+
+/*
+ * scalar version of double_to_int64
+ */
+int64_t double_to_int64(double x) {
+    double magic_number = static_cast<double>(0x0018000000000000);
+    x                   = x + magic_number;
+    return static_cast<int64_t>(x) - static_cast<int64_t>(magic_number);
+}
+
+inline static double frac_arr[] = {
+    1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001,
+    0.0000001, 0.00000001, 0.000000001, 0.0000000001, 0.00000000001, 0.000000000001,
+    0.0000000000001, 0.00000000000001, 0.000000000000001, 0.0000000000000001,
+    0.00000000000000001, 0.000000000000000001, 0.0000000000000000001, 0.00000000000000000001,
+};
+
+const int64_t fact_arr[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000,
+                            1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000,
+                            100000000000000, 1000000000000000, 10000000000000000, 100000000000000000,
+                            1000000000000000000};
+
+const int64_t u_fact_arr[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000,
+                              1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000,
+                              100000000000000, 1000000000000000, 10000000000000000, 100000000000000000,
+                              1000000000000000000};
+
+const double exp_arr[] = {
+    1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0,
+    100000000.0, 1000000000.0, 10000000000.0, 100000000000.0, 1000000000000.0,
+    10000000000000.0, 100000000000000.0, 1000000000000000.0, 10000000000000000.0,
+    100000000000000000.0, 1000000000000000000.0, 10000000000000000000.0,
+    100000000000000000000.0, 1000000000000000000000.0, 10000000000000000000000.0,
+    100000000000000000000000.0,
+};
+
+void bit_set(uint64_t& unit, uint16_t pos) {
+    if (pos == 0) {
+        unit |= 1ULL;
+    } else {
+        unit |= 1ULL << pos;
+    }
+}
+
+void bitmap_patch(uint16_t* pos_p, uint64_t* bitmap, uint16_t* exp_c_p) {
+    uint16_t exc_c = exp_c_p[0];
+    for (size_t i {0}; i < 16; ++i) {
+        bitmap[i] = 0;
+    }
+
+    for (auto i {0}; i < exc_c; ++i) {
+        auto x = pos_p[i] / 64;
+        auto y = pos_p[i] % 64;
+        bit_set(bitmap[x], y);
+    }
+}
+
+common::Decimal encode_single(const double input, common::State& state) {
+    int exp = state.dataset.exponent;
+    // bool neg = input < 0;
+    // double dbl = neg ?
-input : input; + double dbl = input; + int64_t tmp_digit {0}; + + if (input == -0.0 && std::signbit(input)) { return {state.digits, common::exp_exception, input}; } + + // Attempt conversion + double cd = dbl / common::frac10[exp]; + tmp_digit = double_to_int64(cd); + tmp_digit = round(double(tmp_digit) / common::factor_arr[state.dataset.factor]); + double orig = static_cast(tmp_digit * common::factor_arr[state.dataset.factor]) * common::frac10[exp]; + if (orig == dbl) { + state.digits = tmp_digit; + return {state.digits, exp, 0}; + } + + state.dataset.exceptions_count++; + return {state.digits, common::exp_exception, input}; +} + +double decode_single(common::Decimal encoded, common::State& state) { + if (encoded.exp == 23) { return encoded.patch; } + + return static_cast(encoded.digits * common::factor_arr[state.dataset.factor]) * + common::frac10[state.dataset.exponent]; +} + +void encode_patch_v0(const double* in_p, + double* exc_p, + uint16_t* pos_p, + uint16_t* exp_c_p, + uint64_t* bitmap, + int64_t* dig, + uint8_t fac, + uint8_t exp) { + + int64_t tmp_digit {0}; + int64_t scl_digit {0}; + uint16_t exc_c {0}; + + for (size_t i {0}; i < 1024; ++i) { + auto dbl = in_p[i]; + // todo + // if (dbl == -0.0 && std::signbit(dbl)) { + // exc_p[exc_c] = dbl; + // pos_p[exc_c] = i; + // exc_c = exc_c + 1; + // } + + // Attempt conversion + double cd = dbl / frac_arr[exp]; + tmp_digit = static_cast(round(cd)); + double orig = static_cast(tmp_digit) * frac_arr[exp]; + + if (orig == dbl) { + tmp_digit = round(double(tmp_digit) / fact_arr[fac]); + orig = static_cast(tmp_digit * fact_arr[fac]) * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit; + + continue; + } + } + + dig[i] = scl_digit; + exc_p[exc_c] = dbl; + pos_p[exc_c] = i; + exc_c = exc_c + 1; + } + + *exp_c_p = exc_c; + if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); } +} + +/* + * it's not faster. + */ + +void encode_patch_v1(const double* in_p, + double* exc_p, + uint16_t* pos_p, + uint16_t* exp_c_p, + uint64_t* bitmap, + int64_t* dig, + uint8_t fac, + uint8_t exp) { + + int64_t tmp_digit {0}; + int64_t scl_digit {0}; + uint16_t exc_c {0}; + + for (size_t i {0}; i < 1024; ++i) { + auto dbl = in_p[i]; + + double cd = dbl / frac_arr[exp]; + tmp_digit = double_to_int64(cd); + double orig = static_cast(tmp_digit) * frac_arr[exp]; + + if (orig == dbl) { + tmp_digit = round(static_cast(tmp_digit) / fact_arr[fac]); + orig = static_cast(tmp_digit * fact_arr[fac]) * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit; + continue; + } + } + + dig[i] = scl_digit; + exc_p[exc_c] = dbl; + pos_p[exc_c] = i; + exc_c = exc_c + 1; + } + + *exp_c_p = exc_c; + if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); } +} + +/* + * it's not faster. 
+ */ + +void encode_patch_v2(const double* in_p, + double* exc_p, + uint16_t* pos_p, + uint16_t* exp_c_p, + uint64_t* bitmap, + int64_t* dig, + uint8_t fac, + uint8_t exp) { + + int64_t tmp_digit {0}; + int64_t scl_digit {0}; + uint16_t exc_c {0}; + + for (size_t i {0}; i < 1024; ++i) { + auto dbl = in_p[i]; + + double cd = dbl / frac_arr[exp]; + tmp_digit = double_to_int64(cd); + double orig = static_cast(tmp_digit) * frac_arr[exp]; + + if (orig == dbl) { + tmp_digit = double_to_int64(static_cast(tmp_digit) / fact_arr[fac]); + orig = static_cast(tmp_digit * fact_arr[fac]) * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit; + continue; + } + } + + dig[i] = scl_digit; + exc_p[exc_c] = dbl; + pos_p[exc_c] = i; + exc_c = exc_c + 1; + } + + *exp_c_p = exc_c; + if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); } +} + +/* + * towards SIMDized compression + */ +void encode_patch_v3(const double* in_p, + double* exc_p, + uint16_t* pos_p, + uint16_t* exp_c_p, + uint64_t* bitmap, + int64_t* dig, + uint8_t fac, + uint8_t exp) { + + int64_t tmp_digit {0}; + uint16_t exc_c {0}; + + for (size_t i {0}; i < 1024; ++i) { + auto dbl = in_p[i]; + + double cd = dbl / frac_arr[exp]; + tmp_digit = double_to_int64(cd); + tmp_digit = double_to_int64(static_cast(tmp_digit) / fact_arr[fac]); + double orig = static_cast(tmp_digit * fact_arr[fac]) * frac_arr[exp]; + dig[i] = tmp_digit; + + if (orig != dbl) { + dig[i] = dig[0]; + exc_p[exc_c] = dbl; + pos_p[exc_c] = i; + exc_c = exc_c + 1; + } + } + + *exp_c_p = exc_c; + if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); } +} + +void bitmap_patch(double* in_p, double* exp_p, uint16_t* pos_p, uint16_t* exp_c_p, uint64_t* bitmap) { + int c {0}; + uint64_t unit; + for (size_t i {0}; i < 16; ++i) { + unit = bitmap[i]; + while (unit != 0) { + uint64_t t = unit & -unit; + int r = __builtin_ctzll(unit); + in_p[i * 64 + r] = exp_p[c++]; + unit ^= t; + } + } +} + +void patch(double* in_p, double* exp_p, uint16_t* pos_p, uint16_t* exp_c_p, uint64_t* bitmap) { + auto exp_c = exp_c_p[0]; + + if (exp_c > 64 && hybrid_patching_is_enabled) { + bitmap_patch(in_p, exp_p, pos_p, exp_c_p, bitmap); + return; + } + + for (uint16_t i {0}; i < exp_c; ++i) { + in_p[pos_p[i]] = exp_p[i]; + } +} + +void analyze_ffor(int64_t* in_p, uint8_t& bw, int64_t* base_p) { + auto min = std::numeric_limits::max(); + auto max = std::numeric_limits::min(); + + for (size_t i {0}; i < 1024; ++i) { + if (in_p[i] < min) { min = in_p[i]; } + if (in_p[i] > max) { max = in_p[i]; } + } + + uint64_t delta = (static_cast(max) - static_cast(min)); + auto bits_per_digit = ceil(log2(delta + 1)); + bw = bits_per_digit; + base_p[0] = min; +} + +/* + * Best of two v0 and v2 + * Unoptimized + * + */ + +void encode_patch_v4(const double* in_p, + double* exc_p, + uint16_t* pos_p, + uint16_t* exp_c_p, + uint64_t* bitmap, + int64_t* dig, + uint8_t fac, + uint8_t exp) { + + int64_t tmp_digit {0}; + int64_t scl_digit {0}; + uint16_t exc_c {0}; + double cd {0}; + double orig {0}; + + for (size_t i {0}; i < 1024; ++i) { + auto dbl = in_p[i]; + + // Attempt conversion + cd = dbl / frac_arr[exp]; + tmp_digit = static_cast(round(cd)); + orig = static_cast(tmp_digit) * frac_arr[exp]; + + if (orig == dbl) { + tmp_digit = round(double(tmp_digit) / fact_arr[fac]); + orig = static_cast(tmp_digit * fact_arr[fac]) * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit; + + continue; + } + } + + cd = dbl / frac_arr[exp]; + tmp_digit = 
double_to_int64(cd);
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = double_to_int64(static_cast<double>(tmp_digit) / fact_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        dig[i]       = scl_digit;
+        exc_p[exc_c] = dbl;
+        pos_p[exc_c] = i;
+        exc_c        = exc_c + 1;
+    }
+
+    *exp_c_p = exc_c;
+    if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); }
+}
+
+/*
+ * Ceil and Floor technique
+ * Unoptimized
+ *
+ */
+void encode_patch_v5(const double* in_p,
+                     double*       exc_p,
+                     uint16_t*     pos_p,
+                     uint16_t*     exp_c_p,
+                     uint64_t*     bitmap,
+                     int64_t*      dig,
+                     uint8_t       fac,
+                     uint8_t       exp) {
+
+    int64_t  tmp_digit {0};
+    int64_t  scl_digit {0};
+    uint16_t exc_c {0};
+    double   cd {0};
+    double   orig {0};
+
+    for (size_t i {0}; i < 1024; ++i) {
+        auto dbl = in_p[i];
+
+        // Attempt conversion using FLOOR
+        cd        = dbl / frac_arr[exp];
+        tmp_digit = static_cast<int64_t>(floor(cd));
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = floor(double(tmp_digit) / fact_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        // Attempt conversion using CEIL
+        cd        = dbl / frac_arr[exp];
+        tmp_digit = static_cast<int64_t>(ceil(cd));
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = ceil(double(tmp_digit) / fact_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        dig[i]       = scl_digit;
+        exc_p[exc_c] = dbl;
+        pos_p[exc_c] = i;
+        exc_c        = exc_c + 1;
+    }
+
+    *exp_c_p = exc_c;
+    if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); }
+}
+
+void encode_patch_v7(const double* in_p,
+                     double*       exc_p,
+                     uint16_t*     pos_p,
+                     uint16_t*     exp_c_p,
+                     uint64_t*     bitmap,
+                     int64_t*      dig,
+                     uint8_t       fac,
+                     uint8_t       exp) {
+
+    int64_t  tmp_digit {0};
+    int64_t  scl_digit {0};
+    uint16_t exc_c {0};
+    double   cd {0};
+    double   orig {0};
+
+    for (size_t i {0}; i < 1024; ++i) {
+        auto dbl = in_p[i];
+
+        cd        = dbl * fact_arr[exp];
+        tmp_digit = static_cast<int64_t>(round(dbl * fact_arr[exp]));
+        tmp_digit = round(double(tmp_digit) / fact_arr[fac]);
+        orig      = static_cast<double>(tmp_digit * fact_arr[fac]) / static_cast<double>(fact_arr[exp]);
+        if (orig == dbl) {
+            dig[i] = scl_digit = tmp_digit;
+            continue;
+        }
+
+        dig[i]       = scl_digit;
+        exc_p[exc_c] = dbl;
+        pos_p[exc_c] = i;
+        exc_c        = exc_c + 1;
+    }
+
+    *exp_c_p = exc_c;
+    if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); }
+}
+
+void decode_v7(const uint64_t* digits, uint8_t factor_idx, uint8_t exp, double* out_p) {
+
+    uint64_t factor   = fact_arr[factor_idx];
+    int64_t  exponent = fact_arr[exp];
+
+    for (size_t i {0}; i < 1024; ++i) {
+        uint64_t digit = digits[i];
+        out_p[i]       = static_cast<double>(static_cast<int64_t>(digit * factor)) / exponent;
+        // out_p[i] = static_cast<double>(static_cast<int64_t>(digit * factor)) * frac_arr[exp];
+    }
+}
+
+/*
+ * Ceil and Floor technique
+ * Unoptimized
+ * Method: multiply/multiply
+ *
+ */
+void encode_patch_v8(const double* in_p,
+                     double*       exc_p,
+                     uint16_t*     pos_p,
+                     uint16_t*     exp_c_p,
+                     uint64_t*     bitmap,
+                     int64_t*      dig,
+                     uint8_t       fac,
+                     uint8_t       exp) {
+
+    int64_t  tmp_digit {0};
+    int64_t  scl_digit {0};
+    uint16_t exc_c {0};
+    double   cd {0};
+    double   orig {0};
+
+    for (size_t i {0}; i < 1024; ++i) {
+        auto dbl = in_p[i];
+
+        // Attempt conversion using FLOOR
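+        // (a single round() can land one ULP off when the scaled product is
+        // inexact; v5/v8 therefore probe floor() first and ceil() second and
+        // accept the value if either candidate round-trips to the original
+        // double, so fewer values fall out as exceptions)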
+        cd        = dbl * exp_arr[exp];
+        tmp_digit = static_cast<int64_t>(floor(cd));
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = floor(double(tmp_digit) / fact_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        // Attempt conversion using CEIL
+        cd        = dbl * exp_arr[exp];
+        tmp_digit = static_cast<int64_t>(ceil(cd));
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = ceil(double(tmp_digit) / fact_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        dig[i]       = scl_digit;
+        exc_p[exc_c] = dbl;
+        pos_p[exc_c] = i;
+        exc_c        = exc_c + 1;
+    }
+
+    *exp_c_p = exc_c;
+    if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); }
+}
+
+/*
+ * Ceil and Floor technique
+ * Unoptimized
+ * multiply/multiply
+ * second round divide/multiply
+ */
+void encode_patch_v9(const double* in_p,
+                     double*       exc_p,
+                     uint16_t*     pos_p,
+                     uint16_t*     exp_c_p,
+                     uint64_t*     bitmap,
+                     int64_t*      dig,
+                     uint8_t       fac,
+                     uint8_t       exp) {
+
+    int64_t  tmp_digit {0};
+    int64_t  scl_digit {0};
+    uint16_t exc_c {0};
+    double   cd {0};
+    double   orig {0};
+
+    for (size_t i {0}; i < 1024; ++i) {
+        auto dbl = in_p[i];
+
+        // Attempt conversion with multiply/multiply
+        cd        = dbl * exp_arr[exp];
+        tmp_digit = double_to_int64(cd);
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = double_to_int64(double(tmp_digit) * frac_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        // Attempt conversion with divide/multiply and CEIL
+        cd        = dbl / frac_arr[exp];
+        tmp_digit = static_cast<int64_t>(ceil(cd));
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = ceil(double(tmp_digit) / fact_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        // Attempt conversion with divide/multiply and FLOOR
+        cd        = dbl / frac_arr[exp];
+        tmp_digit = static_cast<int64_t>(floor(cd));
+        orig      = static_cast<double>(tmp_digit) * frac_arr[exp];
+
+        if (orig == dbl) {
+            tmp_digit = floor(double(tmp_digit) / fact_arr[fac]);
+            orig      = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+            if (orig == dbl) {
+                dig[i] = scl_digit = tmp_digit;
+                continue;
+            }
+        }
+
+        dig[i]       = scl_digit;
+        exc_p[exc_c] = dbl;
+        pos_p[exc_c] = i;
+        exc_c        = exc_c + 1;
+    }
+
+    *exp_c_p = exc_c;
+    if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); }
+}
+
+/*
+ * Almost everything with multiply
+ * Unoptimized
+ */
+void encode_patch_v10(const double* in_p,
+                      double*       exc_p,
+                      uint16_t*     pos_p,
+                      uint16_t*     exp_c_p,
+                      uint64_t*     bitmap,
+                      int64_t*      dig,
+                      uint8_t       fac,
+                      uint8_t       exp) {
+
+    int64_t  tmp_digit {0};
+    int64_t  tmp_digit_2 {0};
+    int64_t  tmp_digit_3 {0};
+    int64_t  tmp_digit_4 {0};
+    int64_t  scl_digit {0};
+    uint16_t exc_c {0};
+    double   cd {0};
+    double   orig {0};
+
+    for (size_t i {0}; i < 1024; ++i) {
+        auto dbl = in_p[i];
+
+        // MULTIPLY
+        tmp_digit = double_to_int64(dbl * exp_arr[exp] * frac_arr[fac]);
+        orig      = tmp_digit * fact_arr[fac] * frac_arr[exp];
+        if (orig == dbl) {
+            dig[i] = scl_digit = tmp_digit;
+            continue;
+        }
+
+        // Nudge the candidate digit by +/-1 to absorb rounding error
+        tmp_digit_2 = tmp_digit + 1;
+        orig        = tmp_digit_2 * frac_arr[exp] * frac_arr[fac];
+
+        if (orig == dbl) {
+            dig[i] = scl_digit
= tmp_digit_2; + continue; + } + + tmp_digit_2 = tmp_digit - 1; + orig = tmp_digit_2 * frac_arr[exp] * frac_arr[fac]; + + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit_2; + continue; + } + + // If number exceeds double integers threshold, then call NEXTAFTER + int neg = dbl < 0 ? -1 : 1; + tmp_digit_3 = std::nextafter(tmp_digit, DBL_MAX); + orig = tmp_digit_3 * frac_arr[exp] * frac_arr[fac]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit_3; + continue; + } + + tmp_digit_3 = std::nextafter(tmp_digit, -DBL_MAX); + orig = tmp_digit_3 * frac_arr[exp] * frac_arr[fac]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit_3; + continue; + } + + // Finally, try with /* + // This is only really used when exponent - factor > 16 + tmp_digit_4 = double(ceil(ceil(dbl / frac_arr[exp])) / fact_arr[fac]); + orig = (double)(tmp_digit_4 * fact_arr[fac]) * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit_4; + continue; + } + + tmp_digit_4 = double(floor(floor(dbl / frac_arr[exp])) / fact_arr[fac]); + orig = (double)(tmp_digit_4 * fact_arr[fac]) * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit_4; + continue; + } + + dig[i] = scl_digit; + exc_p[exc_c] = dbl; + pos_p[exc_c] = i; + exc_c = exc_c + 1; + } + + *exp_c_p = exc_c; + if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); } +} + +/* + * Almost everything with multiply + * Unoptimized + */ +void encode_patch_v11(const double* in_p, + double* exc_p, + uint16_t* pos_p, + uint16_t* exp_c_p, + uint64_t* bitmap, + int64_t* dig, + uint8_t fac, + uint8_t exp) { + + int64_t tmp_digit {0}; + int64_t scl_digit {0}; + uint16_t exc_c {0}; + double cd {0}; + double orig {0}; + + for (size_t i {0}; i < 1024; ++i) { + auto dbl = in_p[i]; + + // Multiplication **** + tmp_digit = double_to_int64(dbl * exp_arr[exp] * frac_arr[fac]); + orig = tmp_digit * fact_arr[fac] * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit; + continue; + } + + // Division //** + // This is only really used when tmp_digit > MAGIC_NUMBER to avoid rounding errors when multiplying + tmp_digit = round(dbl / frac_arr[exp] / fact_arr[fac]); + orig = tmp_digit * fact_arr[fac] * frac_arr[exp]; + if (orig == dbl) { + dig[i] = scl_digit = tmp_digit; + continue; + } + + dig[i] = scl_digit; + exc_p[exc_c] = dbl; + pos_p[exc_c] = i; + exc_c = exc_c + 1; + } + + *exp_c_p = exc_c; + if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); } +} + +/* + * SIMDIZED + * NOT - FINISHED + */ + +uint64_t tmp_arr[1024]; +double tmp_dbl_arr[1024]; +uint64_t index_arr[1024] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, + 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, + 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 
202, 203, 204, 205, 206, 207, 208, + 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, + 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, + 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, + 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, + 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, + 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, + 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, + 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, + 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, + 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, + 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, + 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, + 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, + 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, + 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, + 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, + 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, + 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, + 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, + 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, + 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, + 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, + 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, + 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, + 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, + 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, + 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, + 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, + 893, 894, 895, 896, 897, 
898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911,
+    912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925,
+    926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939,
+    940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953,
+    954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967,
+    968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981,
+    982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995,
+    996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009,
+    1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
+// lookup_table[m] = popcount(m): number of lanes selected by an 8-bit comparison mask
+uint8_t lookup_table[256] = {
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
+uint64_t tmp_index[1024];
+
+void encode_simdized(const double* in_p,
+                     double*       exc_p,
+                     uint16_t*     pos_p,
+                     uint16_t*     exp_c_p,
+                     uint64_t*     bitmap,
+                     int64_t*      dig,
+                     uint8_t       fac,
+                     uint8_t       exp) {
+
+    int64_t  tmp_digit {0};
+    uint16_t exc_c {0};
+    double   cd {0};
+    double   orig {0};
+    uint64_t pos {0};
+
+    double magic_number = static_cast<double>(0x0018000000000000);
+
+#pragma clang loop vectorize_width(64)
+    for (size_t i {0}; i <= 1023; ++i) {
+        auto dbl = in_p[i];
+
+        // Attempt conversion
+        cd             = dbl * exp_arr[exp] * frac_arr[fac];
+        cd             = cd + magic_number;
+        tmp_digit      = static_cast<int64_t>(cd) - static_cast<int64_t>(magic_number);
+        dig[i]         = tmp_digit;
+        orig           = static_cast<double>(tmp_digit * fact_arr[fac]) * frac_arr[exp];
+        tmp_dbl_arr[i] = orig;
+    }
+
+#ifdef __AVX512F__
+    for (size_t i {0}; i <= 1023; i = i + 8) {
+        __m512d l         = _mm512_loadu_pd(tmp_dbl_arr + i);
+        __m512d r         = _mm512_loadu_pd(in_p + i);
+        __m512i index     = _mm512_loadu_si512(index_arr + i);
+        auto    compare_r = _mm512_cmpneq_pd_mask(l, r);
+        _mm512_mask_compressstoreu_epi64(tmp_index + pos, compare_r, index);
+        pos += lookup_table[compare_r];
+    }
+#else
+    for (size_t i {0}; i <= 1023; ++i) {
+        auto l         = tmp_dbl_arr[i];
+        auto r         = in_p[i];
+        auto compare_r = (l != r);
+        tmp_index[pos] = i;
+        pos += compare_r;
+    }
+#endif
+
+    int64_t for_sure {0};
+    for (size_t i {0}; i <= 1023; ++i) {
+        if (i != tmp_index[i]) {
+            for_sure = dig[i];
+            break;
+        }
+    }
+
+    for (size_t j {0}; j < pos; ++j) {
+        size_t i   = tmp_index[j];
+        auto   dbl = in_p[i];
+
+        tmp_digit = round(dbl / frac_arr[exp] / fact_arr[fac]);
+        orig      = tmp_digit * fact_arr[fac] * frac_arr[exp];
+        if (orig == dbl) {
+            dig[i] = tmp_digit;
+            continue;
+        }
+
+        dig[i]       = for_sure;
+        exc_p[exc_c] = dbl;
+        pos_p[exc_c] = i;
+        exc_c        = exc_c + 1;
+    }
+
+    *exp_c_p = exc_c;
+    if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); }
+}
+
+double magic_number = static_cast<double>(0x0018000000000000);
+
+struct state {
+    int      smp_size {32};
+    uint16_t rg_n_comb {5};
+    std::vector<std::pair<int, int>> cmb_arr;
+    double   smp_arr[1024];
+    uint8_t  exp;
+    uint8_t  fac;
+};
+
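+// A usage sketch for the sampling-based search below (hypothetical values;
+// in the encoder, cmb_arr holds the top combinations found earlier):
+//
+//     alp::state stt;
+//     stt.cmb_arr   = {{14, 10}, {13, 9}}; // candidate (exponent, factor) pairs
+//     stt.rg_n_comb = 2;
+//     uint8_t fac {0}, exp {0};
+//     alp::find_best_exponent_factor_from_combination(
+//         stt.cmb_arr, stt.rg_n_comb, smp_arr, fac, exp);
+//
+// The search tests 32 equidistant values (stride 32 across the 1024-value
+// vector) per candidate pair and keeps the pair minimizing estimated FOR bits
+// plus exception overhead (64 + 16 bits per exception).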
+uint32_t smp_size = 32;
+
+void find_best_exponent_factor_from_combination(std::vector<std::pair<int, int>>& top_combinations,
+                                                uint8_t                           top_n,
+                                                const double*                     smp_arr,
+                                                uint8_t&                          factor,
+                                                uint8_t&                          exponent) {
+    uint8_t  found_exponent {0};
+    uint8_t  found_factor {0};
+    uint64_t previous_bit_count {0};
+    uint8_t  worse_threshold {0};
+
+    // For each top combination
+    for (size_t k {0}; k < top_n; k++) {
+        int      exp_ref    = top_combinations[k].first;
+        int      factor_idx = top_combinations[k].second;
+        uint32_t exception_c {0};
+        uint32_t matches_c {0};
+        uint32_t bits_per_digit {0};
+        uint64_t local_total_bits {0};
+        int64_t  local_max_digits {std::numeric_limits<int64_t>().min()};
+        int64_t  local_min_digits {std::numeric_limits<int64_t>().max()};
+
+        auto j {0};
+        // Test the exponent/factor combination on the sample (32 values, stride 32)
+        for (size_t i = 0; i < smp_size; ++i) {
+            int64_t digits;
+            double  dbl = smp_arr[j];
+            j           = j + 32;
+
+            double orig;
+            double cd;
+
+            cd     = dbl * alp::exp_arr[exp_ref] * alp::frac_arr[factor_idx];
+            digits = alp::double_to_int64(cd);
+            orig   = digits * alp::fact_arr[factor_idx] * alp::frac_arr[exp_ref];
+            if (orig == dbl) {
+                matches_c++;
+                if (digits > local_max_digits) { local_max_digits = digits; }
+                if (digits < local_min_digits) { local_min_digits = digits; }
+            } else {
+                exception_c++;
+            }
+        }
+
+        // Evaluate factor/exponent performance (we optimize for FOR)
+        uint64_t delta = local_max_digits - local_min_digits;
+        bits_per_digit = ceil(log2(delta + 1));
+        local_total_bits += smp_size * bits_per_digit;
+        local_total_bits += exception_c * (64 + 16);
+
+        if (k == 0) { // The first combination seeds the best-so-far
+            previous_bit_count = local_total_bits;
+            found_factor       = factor_idx;
+            found_exponent     = exp_ref;
+            continue; // Go to the second one
+        }
+        if (local_total_bits >= previous_bit_count) { // Current is worse than or equal to the previous best
+            worse_threshold += 1;
+            if (worse_threshold == 2) {
+                break; // Stop only after two consecutive worse results
+            }
+            continue;
+        }
+        // Otherwise replace the best and continue with the next one
+        previous_bit_count = local_total_bits;
+        found_factor       = factor_idx;
+        found_exponent     = exp_ref;
+        worse_threshold    = 0;
+    }
+    exponent = found_exponent;
+    factor   = found_factor;
+}
+
+void encode(
+    const double* in_p, double* exc_p, uint16_t* pos_p, uint16_t* exp_c_p, uint64_t* bitmap, int64_t* dig, state& stt) {
+
+    int64_t  tmp_digit {0};
+    uint16_t exc_c {0};
+    double   cd {0};
+    double   orig {0};
+    uint64_t pos {0};
+
+    if (stt.rg_n_comb > 1) { // Sample and search only when more than one top combination was found
+        find_best_exponent_factor_from_combination(stt.cmb_arr, stt.rg_n_comb, in_p, stt.fac, stt.exp);
+    } else {
+        stt.exp = stt.cmb_arr[0].first;
+        stt.fac = stt.cmb_arr[0].second;
+    }
+
+#pragma clang loop vectorize_width(64)
+    for (size_t i {0}; i <= 1023; ++i) {
+        auto dbl = in_p[i];
+
+        // Attempt conversion
+        cd             = dbl * exp_arr[stt.exp] * frac_arr[stt.fac];
+        cd             = cd + magic_number;
+        tmp_digit      = static_cast<int64_t>(cd) - static_cast<int64_t>(magic_number);
+        dig[i]         = tmp_digit;
+        orig           = static_cast<double>(tmp_digit * fact_arr[stt.fac]) * frac_arr[stt.exp];
+        tmp_dbl_arr[i] = orig;
+    }
+
+#ifdef __AVX512F__
+    for (size_t i {0}; i <= 1023; i = i + 8) {
+        __m512d l         = _mm512_loadu_pd(tmp_dbl_arr + i);
+        __m512d r         = _mm512_loadu_pd(in_p + i);
+        __m512i index     = _mm512_loadu_si512(index_arr + i);
+        auto    compare_r = _mm512_cmpneq_pd_mask(l, r);
+        _mm512_mask_compressstoreu_epi64(tmp_index + pos, compare_r, index);
+        pos += lookup_table[compare_r];
+    }
+#else
+    for (size_t i {0}; i <= 1023; ++i) {
+        auto l = tmp_dbl_arr[i];
+        auto r = in_p[i];
+        auto compare_r
= (l != r); + tmp_index[pos] = i; + pos += compare_r; + } +#endif + + int64_t for_sure; + for (size_t i {0}; i <= 1023; ++i) { + if (i != tmp_index[i]) { + for_sure = dig[i]; + break; + } + } + + for (size_t j {0}; j < pos; ++j) { + size_t i = tmp_index[j]; + auto dbl = in_p[i]; + + tmp_digit = round(dbl / frac_arr[stt.exp] / fact_arr[stt.fac]); + orig = tmp_digit * fact_arr[stt.fac] * frac_arr[stt.exp]; + if (orig == dbl) { + dig[i] = tmp_digit; + continue; + } + + dig[i] = for_sure; + exc_p[exc_c] = dbl; + pos_p[exc_c] = i; + exc_c = exc_c + 1; + } + + *exp_c_p = exc_c; + if (exc_c > 64 && hybrid_patching_is_enabled) { bitmap_patch(pos_p, bitmap, exp_c_p); } +} + +} // namespace alp +#endif diff --git a/benchmarks/analyze_better_blocks/alp/manual/include/dataset.hpp b/benchmarks/analyze_better_blocks/alp/manual/include/dataset.hpp new file mode 100644 index 0000000..61b773c --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp/manual/include/dataset.hpp @@ -0,0 +1,44 @@ +#ifndef DATASET_HPP +#define DATASET_HPP + +#include "string" +#include +#include + +namespace dataset { +struct Dataset { + uint64_t id; + std::string name; + std::string file_path; + std::string digits_file_path; + std::string exceptions_file_path; + int exponent; + int64_t factor_idx; + uint16_t exc_c; + double coverage; + double bits_per_value; + int64_t max_digit; + int64_t min_digit; + uint8_t bw; +}; + +struct paths { + std::string DATASETS_1024_SAMPLES_PATH = "../data/1024_data_samples/"; + std::string DATASETS_1024_DIGITS_PATH = "../data/1024_data_digits/"; + std::string DATASETS_1024_EXCEPTIONS_PATH = "../data/1024_data_exceptions/"; + std::string DATASETS_COMPLETE_PATH = "../data/data_for_c/"; + std::string ALP_DATA_DIR_PATH = "../data"; + explicit paths() { + if (auto v = std::getenv("DATASETS_1024_SAMPLES_PATH")) { DATASETS_1024_SAMPLES_PATH = v; } + if (auto v = std::getenv("DATASETS_1024_DIGITS_PATH")) { DATASETS_1024_DIGITS_PATH = v; } + if (auto v = std::getenv("DATASETS_1024_EXCEPTIONS_PATH")) { DATASETS_1024_EXCEPTIONS_PATH = v; } + if (auto v = std::getenv("DATASETS_COMPLETE_PATH")) { DATASETS_COMPLETE_PATH = v; } + if (auto v = std::getenv("ALP_DATA_DIR_PATH")) { ALP_DATA_DIR_PATH = v; } + } +}; + +paths PATHS; + +} // namespace dataset + +#endif \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/alp/manual/include/datasets.hpp b/benchmarks/analyze_better_blocks/alp/manual/include/datasets.hpp new file mode 100644 index 0000000..6defaf3 --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp/manual/include/datasets.hpp @@ -0,0 +1,448 @@ +#ifndef DATASETS_HPP +#define DATASETS_HPP + +#include "dataset.hpp" +#include "string" +#include +#include + +namespace dataset { + +std::vector datasets = { + {1, + "arade4", + PATHS.DATASETS_1024_SAMPLES_PATH + "arade4.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "arade4.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "arade4.csv", + 14, + 10, + 8, + 0, + 0, + 0, + 0, + 24}, + + {2, + "basel_temp_f", + PATHS.DATASETS_1024_SAMPLES_PATH + "basel_temp_f.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "basel_temp_f.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "basel_temp_f.csv", + 14, + 7, // + 47, + 0, + 0, + 0, + 0, + 28}, // 3 works better than 14 + + {3, + "basel_wind_f", + PATHS.DATASETS_1024_SAMPLES_PATH + "basel_wind_f.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "basel_wind_f.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "basel_wind_f.csv", + 14, + 7, // + 9, + 0, + 0, + 0, + 0, + 29}, // 3 works better than 14 + + {4, + "bird_migration_f", + 
PATHS.DATASETS_1024_SAMPLES_PATH + "bird_migration_f.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "bird_migration_f.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "bird_migration_f.csv", + 14, + 9, // + 2, + 0, + 0, + 0, + 0, + 17}, // 3 works better than 14 + + {5, + "bitcoin_f", + PATHS.DATASETS_1024_SAMPLES_PATH + "bitcoin_f.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "bitcoin_f.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "bitcoin_f.csv", + 14, + 10, // + 10, + 0, + 0, + 0, + 0, + 25}, // 3 works better than 14 + + {6, + "bitcoin_transactions_f", + PATHS.DATASETS_1024_SAMPLES_PATH + "bitcoin_transactions_f.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "bitcoin_transactions_f.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "bitcoin_transactions_f.csv", // + 14, + 10, + 11, + 0, + 0, + 0, + 0, + 30}, + + {7, + "city_temperature_f", + PATHS.DATASETS_1024_SAMPLES_PATH + "city_temperature_f.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "city_temperature_f.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "city_temperature_f.csv", + 14, + 13, // + 0, + 0, + 0, + 0, + 0, + 11}, // 3 works better than 14 + + {8, + "cms1", + PATHS.DATASETS_1024_SAMPLES_PATH + "cms1.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "cms1.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "cms1.csv", // + 14, + 5, + 10, + 0, + 0, + 0, + 0, + 41}, + + {9, + "cms9", + PATHS.DATASETS_1024_SAMPLES_PATH + "cms9.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "cms9.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "cms9.csv", // + 16, + 16, + 2, + 0, + 0, + 0, + 0, + 10}, + + {10, + "cms25", + PATHS.DATASETS_1024_SAMPLES_PATH + "cms25.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "cms25.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "cms25.csv", // + 14, + 4, + 6, + 0, + 0, + 0, + 0, + 42}, + + {11, + "food_prices", + PATHS.DATASETS_1024_SAMPLES_PATH + "food_prices.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "food_prices.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "food_prices.csv", // + 16, + 12, + 24, + 0, + 0, + 0, + 0, + 23}, + + {12, + "gov10", + PATHS.DATASETS_1024_SAMPLES_PATH + "gov10.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "gov10.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov10.csv", + 3, + 1, + 41, + 0, + 0, + 0, + 0, + 29}, + + {13, + "gov26", + PATHS.DATASETS_1024_SAMPLES_PATH + "gov26.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "gov26.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov26.csv", + 18, + 18, + 0, + 0, + 0, + 0, + 0, + 0}, + + {14, + "gov30", + PATHS.DATASETS_1024_SAMPLES_PATH + "gov30.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "gov30.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov30.csv", + 18, + 18, + 4, + 0, + 0, + 0, + 0, + 0}, + + {15, + "gov31", + PATHS.DATASETS_1024_SAMPLES_PATH + "gov31.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "gov31.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov31.csv", + 18, + 18, + 1, + 0, + 0, + 0, + 0, + 0}, + + {16, + "gov40", + PATHS.DATASETS_1024_SAMPLES_PATH + "gov40.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "gov40.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov40.csv", + 18, + 18, + 3, + 0, + 0, + 0, + 0, + 0}, + + {17, + "medicare1", + PATHS.DATASETS_1024_SAMPLES_PATH + "medicare1.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "medicare1.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "medicare1.csv", + 14, + 5, + 37, + 0, + 0, + 0, + 0, + 38}, + + {18, + "medicare9", + PATHS.DATASETS_1024_SAMPLES_PATH + "medicare9.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "medicare9.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "medicare9.csv", + 16, + 16, + 3, + 0, + 0, + 0, + 0, + 10}, + + {19, + 
"neon_air_pressure", + PATHS.DATASETS_1024_SAMPLES_PATH + "neon_air_pressure.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_air_pressure.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_air_pressure.csv", + 14, + 9, + 3, + 0, + 0, + 0, + 0, + 16}, + + {20, + "neon_bio_temp_c", + PATHS.DATASETS_1024_SAMPLES_PATH + "neon_bio_temp_c.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_bio_temp_c.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_bio_temp_c.csv", + 14, + 12, + 0, + 0, + 0, + 0, + 0, + 10}, + + {21, + "neon_dew_point_temp", + PATHS.DATASETS_1024_SAMPLES_PATH + "neon_dew_point_temp.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_dew_point_temp.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_dew_point_temp.csv", + 14, + 11, + 6, + 0, + 0, + 0, + 0, + 13}, + + {22, + "neon_pm10_dust", + PATHS.DATASETS_1024_SAMPLES_PATH + "neon_pm10_dust.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "neon_pm10_dust.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_pm10_dust.csv", // + 14, + 11, + 0, + 0, + 0, + 0, + 0, + 8}, + + {23, + "neon_wind_dir", + PATHS.DATASETS_1024_SAMPLES_PATH + "neon_wind_dir.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_wind_dir.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_wind_dir.csv", + 14, + 12, + 0, + 0, + 0, + 0, + 0, + 16}, + + {24, + "nyc29", + PATHS.DATASETS_1024_SAMPLES_PATH + "nyc29.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "nyc29.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "nyc29.csv", // + 14, + 1, + 5, + 0, + 0, + 0, + 0, + 42}, + + {25, + "poi_lat", + PATHS.DATASETS_1024_SAMPLES_PATH + "poi_lat.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "poi_lat.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "poi_lat.csv", + 16, + 0, + 157, + 0, + 0, + 0, + 0, + 55}, + + {26, + "poi_lon", + PATHS.DATASETS_1024_SAMPLES_PATH + "poi_lon.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "poi_lon.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "poi_lon.csv", // + 16, + 0, + 199, + 0, + 0, + 0, + 0, + 56}, + + {27, + "ssd_hdd_benchmarks_f", + PATHS.DATASETS_1024_SAMPLES_PATH + "ssd_hdd_benchmarks_f.csv", + PATHS.DATASETS_1024_DIGITS_PATH + "ssd_hdd_benchmarks_f.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "ssd_hdd_benchmarks_f.csv", // + 14, + 13, + 0, + 0, + 0, + 0, + 0, + 17}, + + {28, + "stocks_de", + PATHS.DATASETS_1024_SAMPLES_PATH + "stocks_de.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "stocks_de.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "stocks_de.csv", + 14, + 11, + 5, + 0, + 0, + 0, + 0, + 10}, + + {29, + "stocks_uk", + PATHS.DATASETS_1024_SAMPLES_PATH + "stocks_uk.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "stocks_uk.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "stocks_uk.csv", + 14, + 13, + 0, + 0, + 0, + 0, + 0, + 9}, + + {30, + "stocks_usa_c", + PATHS.DATASETS_1024_SAMPLES_PATH + "stocks_usa_c.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "stocks_usa_c.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "stocks_usa_c.csv", + 14, + 12, + 0, + 0, + 0, + 0, + 0, + 7}, + + {30, + "z_n", + PATHS.DATASETS_1024_SAMPLES_PATH + "z_n.csv", // + PATHS.DATASETS_1024_DIGITS_PATH + "z_n.csv", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "z_n.csv", + 14, + 12, + 0, + 0, + 0, + 0, + 0, + 7}, + +}; +} // namespace dataset +#endif \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/alp/manual/include/datasets_complete.hpp b/benchmarks/analyze_better_blocks/alp/manual/include/datasets_complete.hpp new file mode 100644 index 0000000..c79a6a1 --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp/manual/include/datasets_complete.hpp @@ -0,0 +1,434 
@@ +#ifndef DATASET_COMPLETE_HPP +#define DATASET_COMPLETE_HPP + +#include "dataset.hpp" +#include "string" +#include +#include + +namespace dataset { + +std::vector datasets_complete = { + {1, + "arade4", + PATHS.DATASETS_COMPLETE_PATH + "arade4.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "arade4.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "arade4.bin", + 14, + 10, + 8, + 0, + 0, + 0, + 0, + 24}, + + {2, + "basel_temp_f", + PATHS.DATASETS_COMPLETE_PATH + "basel_temp_f.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "basel_temp_f.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "basel_temp_f.bin", + 14, + 7, // + 76, + 0, + 0, + 0, + 0, + 28}, // 3 works better than 14 + + {3, + "basel_wind_f", + PATHS.DATASETS_COMPLETE_PATH + "basel_wind_f.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "basel_wind_f.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "basel_wind_f.bin", + 14, + 7, // + 9, + 0, + 0, + 0, + 0, + 29}, // 3 works better than 14 + + {4, + "bird_migration_f", + PATHS.DATASETS_COMPLETE_PATH + "bird_migration_f.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "bird_migration_f.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "bird_migration_f.bin", + 16, + 11, // + 52, + 0, + 0, + 0, + 0, + 17}, // 3 works better than 14 + + {5, + "bitcoin_f", + PATHS.DATASETS_COMPLETE_PATH + "bitcoin_f.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "bitcoin_f.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "bitcoin_f.bin", + 13, + 9, // + 125, + 0, + 0, + 0, + 0, + 25}, // 3 works better than 14 + + {6, + "bitcoin_transactions_f", + PATHS.DATASETS_COMPLETE_PATH + "bitcoin_transactions_f.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "bitcoin_transactions_f.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "bitcoin_transactions_f.bin", // + 14, + 10, + 106, + 0, + 0, + 0, + 0, + 30}, + + {7, + "city_temperature_f", + PATHS.DATASETS_COMPLETE_PATH + "city_temperature_f.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "city_temperature_f.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "city_temperature_f.bin", + 14, + 13, // + 0, + 0, + 0, + 0, + 0, + 11}, // 3 works better than 14 + + {8, + "cms1", + PATHS.DATASETS_COMPLETE_PATH + "cms1.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "cms1.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "cms1.bin", // + 14, + 5, + 10, + 0, + 35.71, + 0, + 0, + 41}, + + {9, + "cms9", + PATHS.DATASETS_COMPLETE_PATH + "cms9.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "cms9.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "cms9.bin", // + 14, + 14, + 0, + 0, + 0, + 0, + 0, + 11}, + + {10, + "cms25", + PATHS.DATASETS_COMPLETE_PATH + "cms25.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "cms25.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "cms25.bin", // + 14, + 4, + 6, + 0, + 0, + 0, + 0, + 42}, + + {11, + "food_prices", + PATHS.DATASETS_COMPLETE_PATH + "food_prices.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "food_prices.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "food_prices.bin", // + 14, + 11, + 97, + 0, + 0, + 0, + 0, + 27}, + + {12, + "gov10", + PATHS.DATASETS_COMPLETE_PATH + "gov10.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "gov10.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov10.bin", + 13, + 11, + 109, + 0, + 0, + 0, + 0, + 27}, + + {13, + "gov26", + PATHS.DATASETS_COMPLETE_PATH + "gov26.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "gov26.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov26.bin", + 13, + 13, + 0, + 0, + 0, + 0, + 0, + 0}, + + {14, + "gov30", + PATHS.DATASETS_COMPLETE_PATH + "gov30.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "gov30.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov30.bin", + 13, + 13, + 51, + 0, + 
0, + 0, + 0, + 19}, + + {15, + "gov31", + PATHS.DATASETS_COMPLETE_PATH + "gov31.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "gov31.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov31.bin", + 14, + 13, + 1, + 0, + 0, + 0, + 0, + 0}, + + {16, + "gov40", + PATHS.DATASETS_COMPLETE_PATH + "gov40.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "gov40.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "gov40.bin", + 13, + 13, + 0, + 0, + 0, + 0, + 0, + 19}, + + {17, + "medicare1", + PATHS.DATASETS_COMPLETE_PATH + "medicare1.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "medicare1.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "medicare1.bin", + 14, + 12, + 255, + 0, + 39.39, + 0, + 0, + 15}, + + {18, + "medicare9", + PATHS.DATASETS_COMPLETE_PATH + "medicare9.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "medicare9.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "medicare9.bin", + 14, + 14, + 0, + 0, + 0, + 0, + 0, + 11}, + + {19, + "neon_air_pressure", + PATHS.DATASETS_COMPLETE_PATH + "neon_air_pressure.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_air_pressure.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_air_pressure.bin", + 16, + 11, + 172, + 0, + 0, + 0, + 0, + 16}, + + {20, + "neon_bio_temp_c", + PATHS.DATASETS_COMPLETE_PATH + "neon_bio_temp_c.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_bio_temp_c.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_bio_temp_c.bin", + 14, + 12, + 0, + 0, + 0, + 0, + 0, + 10}, + + {21, + "neon_dew_point_temp", + PATHS.DATASETS_COMPLETE_PATH + "neon_dew_point_temp.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_dew_point_temp.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_dew_point_temp.bin", + 14, + 11, + 6, + 0, + 0, + 0, + 0, + 13}, + + {22, + "neon_pm10_dust", + PATHS.DATASETS_COMPLETE_PATH + "neon_pm10_dust.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "neon_pm10_dust.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_pm10_dust.bin", // + 14, + 11, + 0, + 0, + 0, + 0, + 0, + 8}, + + {23, + "neon_wind_dir", + PATHS.DATASETS_COMPLETE_PATH + "neon_wind_dir.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "neon_wind_dir.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "neon_wind_dir.bin", + 14, + 12, + 0, + 0, + 0, + 0, + 0, + 16}, + + {24, + "nyc29", + PATHS.DATASETS_COMPLETE_PATH + "nyc29.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "nyc29.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "nyc29.bin", // + 14, + 1, + 92, + 0, + 40.38, + 0, + 0, + 42}, + + {25, + "poi_lat", + PATHS.DATASETS_COMPLETE_PATH + "poi_lat.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "poi_lat.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "poi_lat.bin", + 16, + 0, + 58, + 0, + 0, + 0, + 0, + 55}, + + {26, + "poi_lon", + PATHS.DATASETS_COMPLETE_PATH + "poi_lon.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "poi_lon.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "poi_lon.bin", // + 16, + 0, + 48, + 0, + 0, + 0, + 0, + 56}, + + {27, + "ssd_hdd_benchmarks_f", + PATHS.DATASETS_COMPLETE_PATH + "ssd_hdd_benchmarks_f.bin", + PATHS.DATASETS_1024_DIGITS_PATH + "ssd_hdd_benchmarks_f.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "ssd_hdd_benchmarks_f.bin", // + 14, + 13, + 0, + 0, + 0, + 0, + 0, + 17}, + + {28, + "stocks_de", + PATHS.DATASETS_COMPLETE_PATH + "stocks_de.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "stocks_de.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "stocks_de.bin", + 14, + 11, + 5, + 0, + 0, + 0, + 0, + 10}, + + {29, + "stocks_uk", + PATHS.DATASETS_COMPLETE_PATH + "stocks_uk.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "stocks_uk.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + 
"stocks_uk.bin", + 14, + 13, + 0, + 0, + 0, + 0, + 0, + 9}, + + {30, + "stocks_usa_c", + PATHS.DATASETS_COMPLETE_PATH + "stocks_usa_c.bin", // + PATHS.DATASETS_1024_DIGITS_PATH + "stocks_usa_c.bin", + PATHS.DATASETS_1024_EXCEPTIONS_PATH + "stocks_usa_c.bin", + 14, + 11, + 0, + 0, + 0, + 0, + 0, + 10}, + +}; +} // namespace dataset +#endif \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/alp/manual/test_ped.cpp b/benchmarks/analyze_better_blocks/alp/manual/test_ped.cpp new file mode 100644 index 0000000..cbe0edd --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp/manual/test_ped.cpp @@ -0,0 +1,163 @@ +#include "PerfEvent.hpp" +#include "Units.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +#include "datablock/schemes/CSchemePool.hpp" +#include "datablock/schemes/v2/double/Decimal.hpp" +#include "datablock/schemes/v2/double/DoubleBP.hpp" +#include "datablock/schemes/v2/double/DynamicDictionary.hpp" +#include "datablock/schemes/v2/double/Frequency.hpp" +#include "datablock/schemes/v2/double/RLE.hpp" +#include "datablock/schemes/v2/integer/PBP.hpp" +#include "gflags/gflags.h" +#include "include/datasets.hpp" +#include "include/datasets_complete.hpp" +#include "spdlog/fmt/bundled/ranges.h" +#include "spdlog/spdlog.h" +#include "gtest/gtest.h" +#include +#include +#include +#include + +// for some reason, this is only DECLARED in DynamicDictionary but not defined (breaks linking) +// and then DEFINED in every cpp file that uses it +DEFINE_string(fsst_stats, "", ""); +DEFINE_string(file_list_file, "pbi-double-columns.txt", "file-list"); +DEFINE_int32(cascade_depth, 1, "cascade"); + +class ped_test : public ::testing::Test { +public: + uint8_t* compressed_arr; + double* dbl_arr; + double* dec_dbl_arr; + + void SetUp() override { + dbl_arr = new double[1024]; + dec_dbl_arr = new double[1024 * 100]; + compressed_arr = new uint8_t[1024 * 1000000000]; + } + ~ped_test() override { + delete[] dbl_arr; + delete[] compressed_arr; + delete[] dec_dbl_arr; + } +}; +void setupSchemePool() { + using namespace cengine::db; + cengine::db::CSchemePool::refresh(); + auto& schemes = *cengine::db::CSchemePool::available_schemes; + + // for (auto& scheme : schemes.double_schemes) { + // std::cout << ConvertSchemeTypeToString(scheme.first) << std::endl; + // } + // for (auto& scheme : schemes.integer_schemes) { + // std::cout << ConvertSchemeTypeToString(scheme.first) << std::endl; + // } + + // double: DOUBLE_BP, UNCOMPRESSED, + for (auto it = schemes.double_schemes.begin(); it != schemes.double_schemes.end();) { + if (it->first != DoubleSchemeType::DOUBLE_BP // + && it->first != DoubleSchemeType::UNCOMPRESSED // + ) { + it = schemes.double_schemes.erase(it); + } else { + ++it; + } + } + + // int: X_FBP, UNCOMPRESSED, + for (auto it = schemes.integer_schemes.begin(); it != schemes.integer_schemes.end();) { + if (it->first != IntegerSchemeType::X_PBP // + && it->first != IntegerSchemeType::UNCOMPRESSED // + // && it->first != IntegerSchemeType::ONE_VALUE // + ) { + it = schemes.integer_schemes.erase(it); + } else { + ++it; + } + } +} + +TEST_F(ped_test, test_one_vector) { + setupSchemePool(); + cengine::db::v2::d::Decimal pd; + std::cout << pd.selfDescription() << std::endl; + for (auto& dataset : dataset::datasets) { + std::ifstream ifile(dataset.file_path, std::ios::in); + + if (ifile.fail()) { + std::cout << "ifile fails."; + std::exit(-1); + } + + // Read Data + double num = 0.0; + // keep storing values from the text file so long as 
data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c = c + 1; + } + + /* Init Encoding */ + size_t cascade = 2; + size_t output_bytes; + size_t size = 1024; + std::vector dst(size * 2, 0); + + cengine::db::DoubleStats stats(dbl_arr, nullptr, size); + stats = cengine::db::DoubleStats::generateStats(dbl_arr, nullptr, size); + + /* Encode */ + output_bytes = pd.compress(dbl_arr, nullptr, compressed_arr, stats, cascade); + +// std::cout << pd.fullDescription(compressed_arr) << std::endl; + + /* Init decoding. */ + + /* DECODE */ + pd.decompress(dst.data(), nullptr, compressed_arr, stats.tuple_count, cascade); + + /* Validate. */ + for (auto i = 0ul; i != size; ++i) { + ASSERT_EQ(dbl_arr[i], dst[i]); + } + std::cout << dataset.name << " : " << output_bytes / (1.0 * size * sizeof(double)) * 64 << std::endl; + } +} + +TEST_F(ped_test, test_all_dataset) { + setupSchemePool(); + cengine::db::v2::d::Decimal pd; + // std::cout << pd.selfDescription() << std::endl; + for (auto& dataset : dataset::datasets_complete) { + Vector doubles(dataset.file_path.c_str()); + + /* Init Encoding */ + size_t cascade = 2; + size_t output_bytes; + size_t size = doubles.size(); + std::cout << "size is : " << size << std::endl; + std::vector dst(size * 2, 0); + + cengine::db::DoubleStats stats(doubles.data, nullptr, size); + stats = cengine::db::DoubleStats::generateStats(doubles.data, nullptr, size); + + /* Encode */ + output_bytes = pd.compress(doubles.data, nullptr, compressed_arr, stats, cascade); + + std::cout << pd.fullDescription(compressed_arr) << std::endl; + + /* Init decoding. */ + + /* DECODE */ + pd.decompress(dst.data(), nullptr, compressed_arr, stats.tuple_count, cascade); + + /* Validate. */ + for (auto i = 0ul; i != size; ++i) { + ASSERT_EQ(doubles.data[i], dst[i]); + } + std::cout << dataset.name << " : " << output_bytes / (1.0 * size * sizeof(double)) * 64 << std::endl; + } +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/alp_bench/alp_bench.hpp b/benchmarks/analyze_better_blocks/alp_bench/alp_bench.hpp new file mode 100644 index 0000000..7fb117b --- /dev/null +++ b/benchmarks/analyze_better_blocks/alp_bench/alp_bench.hpp @@ -0,0 +1,2260 @@ +#ifndef FASTLANES_COMPRESSION_FLS_BENCH_FLS_BENCH_HPP +#define FASTLANES_COMPRESSION_FLS_BENCH_FLS_BENCH_HPP + +/* + * The M1 cycle counter is from Lemire repo. todo -> add the link + * The other parts are from google benchmark repo, edited heavily. todo -> add the link + */ +#include +#include +#include +#include +#include +#include // for errno +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for memset +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for ioctl +#include +#include // for syscall +#include +#include + +#if defined(__linux__) +#include // for __NR_perf_event_open +#include // for perf event constants +#endif +/*---------------------------------------------------------------------------------------------------------------------\ + * Macros: +\---------------------------------------------------------------------------------------------------------------------*/ +// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer. 
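// [annotation] Both test_ped.cpp tests above report
//   output_bytes / (1.0 * size * sizeof(double)) * 64
// which is simply the average number of compressed bits per double: the
// division gives the ratio against the raw 8-byte representation, and the
// factor 64 converts that ratio back into bits. Restated (the helper name is
// illustrative, not part of the patch):
//
//   static double bits_per_value(size_t output_bytes, size_t n_values) {
//       return output_bytes / (1.0 * n_values * sizeof(double)) * 64;
//   }
//   // e.g. 1024 doubles compressed into 2048 bytes -> 16.0 bits per value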
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) +#define BENCHMARK_HAS_CXX11 +#endif + +// This _MSC_VER check should detect VS 2017 v15.3 and newer. +#if __cplusplus >= 201703L || (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L) +#define BENCHMARK_HAS_CXX17 +#endif + +#if defined(BENCHMARK_HAS_CXX11) +#include +#include +#include +#endif + +#if defined(_MSC_VER) +#include // for _ReadWriteBarrier +#endif + +#ifndef BENCHMARK_HAS_CXX11 +#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + TypeName& operator=(const TypeName&) +#else +#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + TypeName& operator=(const TypeName&) = delete +#endif + +#ifdef BENCHMARK_HAS_CXX17 +#define BENCHMARK_UNUSED FLS_BENCH_MAYBE_UNUSED +#elif defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_UNUSED __attribute__((unused)) +#else +#define BENCHMARK_UNUSED +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#elif defined(_MSC_VER) && !defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __forceinline +#if _MSC_VER >= 1900 +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#else +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif +#define __func__ __FUNCTION__ +#else +#define BENCHMARK_ALWAYS_INLINE +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif + +#define BENCHMARK_INTERNAL_TOSTRING2(x) #x +#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x) + +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y) +#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) +#else +#define BENCHMARK_BUILTIN_EXPECT(x, y) x +#define BENCHMARK_DEPRECATED_MSG(msg) +#define BENCHMARK_WARNING_MSG(msg) \ + __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING(__LINE__) ") : warning note: " msg)) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if defined(__GNUC__) || __has_builtin(__builtin_unreachable) +#define BENCHMARK_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +#define BENCHMARK_UNREACHABLE() __assume(false) +#else +#define BENCHMARK_UNREACHABLE() ((void)0) +#endif + +#ifdef BENCHMARK_HAS_CXX11 +#else +#endif + +// clang-format off + +#ifndef __has_feature + #define __has_feature(x) 0 +#endif + +#if defined(__clang__) + #if defined(__ibmxl__) + #if !defined(COMPILER_IBMXL) + #define COMPILER_IBMXL + #endif + #elif !defined(COMPILER_CLANG) + #define COMPILER_CLANG + #endif +#elif defined(_MSC_VER) + #if !defined(COMPILER_MSVC) + #define COMPILER_MSVC + #endif +#elif defined(__GNUC__) + #if !defined(COMPILER_GCC) + #define COMPILER_GCC + #endif +#endif + +#if __has_feature(cxx_attributes) + #define BENCHMARK_NORETURN [[noreturn]] +#elif defined(__GNUC__) + #define BENCHMARK_NORETURN __attribute__((noreturn)) +#elif defined(COMPILER_MSVC) + #define BENCHMARK_NORETURN __declspec(noreturn) +#else + #define BENCHMARK_NORETURN +#endif + +#if defined(__CYGWIN__) + #define BENCHMARK_OS_CYGWIN 1 +#elif defined(_WIN32) + #define BENCHMARK_OS_WINDOWS 1 + #if defined(__MINGW32__) + #define BENCHMARK_OS_MINGW 1 + #endif +#elif defined(__APPLE__) + 
#define BENCHMARK_OS_APPLE 1 + #include "TargetConditionals.h" + #if defined(TARGET_OS_MAC) + #define BENCHMARK_OS_MACOSX 1 + #if defined(TARGET_OS_IPHONE) + #define BENCHMARK_OS_IOS 1 + #endif + #endif +#elif defined(__FreeBSD__) + #define BENCHMARK_OS_FREEBSD 1 +#elif defined(__NetBSD__) + #define BENCHMARK_OS_NETBSD 1 +#elif defined(__OpenBSD__) + #define BENCHMARK_OS_OPENBSD 1 +#elif defined(__DragonFly__) + #define BENCHMARK_OS_DRAGONFLY 1 +#elif defined(__linux__) + #define BENCHMARK_OS_LINUX 1 +#elif defined(__native_client__) + #define BENCHMARK_OS_NACL 1 +#elif defined(__EMSCRIPTEN__) + #define BENCHMARK_OS_EMSCRIPTEN 1 +#elif defined(__rtems__) + #define BENCHMARK_OS_RTEMS 1 +#elif defined(__Fuchsia__) + #define BENCHMARK_OS_FUCHSIA 1 +#elif defined (__SVR4) && defined (__sun) + #define BENCHMARK_OS_SOLARIS 1 +#elif defined(__QNX__) + #define BENCHMARK_OS_QNX 1 +#elif defined(__MVS__) + #define BENCHMARK_OS_ZOS 1 +#endif + +#if defined(__ANDROID__) && defined(__GLIBCXX__) + #define BENCHMARK_STL_ANDROID_GNUSTL 1 +#endif + +#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \ + && !defined(__EXCEPTIONS) + #define BENCHMARK_HAS_NO_EXCEPTIONS +#endif + +#if defined(COMPILER_CLANG) || defined(COMPILER_GCC) + #define FLS_BENCH_MAYBE_UNUSED __attribute__((unused)) +#else + #define BENCHMARK_MAYBE_UNUSED +#endif + +// clang-format on + +#ifdef BENCHMARK_OS_WINDOWS +#include +#undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA +#include +#include +#include +#else +#include +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD +#include +#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \ + defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY +#define BENCHMARK_HAS_SYSCTL +#include +#endif +#endif +#if defined(BENCHMARK_OS_SOLARIS) +#include +#endif +#if defined(BENCHMARK_OS_QNX) +#include +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#elif defined(_MSC_VER) && !defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __forceinline +#if _MSC_VER >= 1900 +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#else +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif +#define __func__ __FUNCTION__ +#else +#define BENCHMARK_ALWAYS_INLINE +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif + +#if defined(BENCHMARK_OS_MACOSX) +#include +#endif +// For MSVC, we want to use '_asm rdtsc' when possible (since it works +// with even ancient MSVC compilers), and when not possible the +// __rdtsc intrinsic, declared in . Unfortunately, in some +// environments, and have conflicting +// declarations of some other intrinsics, breaking compilation. +// Therefore, we simply declare __rdtsc ourselves. 
See also +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) +extern "C" uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif + +#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW) +#include +#include +#endif + +#ifdef BENCHMARK_OS_EMSCRIPTEN +#include +#endif + +#ifdef __aarch64__ +#define KPERF_LIST \ + /* ret, name, params */ \ + F(int, kpc_get_counting, void) \ + F(int, kpc_force_all_ctrs_set, int) \ + F(int, kpc_set_counting, uint32_t) \ + F(int, kpc_set_thread_counting, uint32_t) \ + F(int, kpc_set_config, uint32_t, void*) \ + F(int, kpc_get_config, uint32_t, void*) \ + F(int, kpc_set_period, uint32_t, void*) \ + F(int, kpc_get_period, uint32_t, void*) \ + F(uint32_t, kpc_get_counter_count, uint32_t) \ + F(uint32_t, kpc_get_config_count, uint32_t) \ + F(int, kperf_sample_get, int*) \ + F(int, kpc_get_thread_counters, int, unsigned int, void*) + +#define F(ret, name, ...) \ + typedef ret name##proc(__VA_ARGS__); \ + static name##proc* name; +KPERF_LIST +#undef F + +#define CFGWORD_EL0A32EN_MASK (0x10000) +#define CFGWORD_EL0A64EN_MASK (0x20000) +#define CFGWORD_EL1EN_MASK (0x40000) +#define CFGWORD_EL3EN_MASK (0x80000) +#define CFGWORD_ALLMODES_MASK (0xf0000) + +#define CPMU_NONE 0 +#define CPMU_CORE_CYCLE 0x02 +#define CPMU_INST_A64 0x8c +#define CPMU_INST_BRANCH 0x8d +#define CPMU_SYNC_DC_LOAD_MISS 0xbf +#define CPMU_SYNC_DC_STORE_MISS 0xc0 +#define CPMU_SYNC_DTLB_MISS 0xc1 +#define CPMU_SYNC_ST_HIT_YNGR_LD 0xc4 +#define CPMU_SYNC_BR_ANY_MISP 0xcb +#define CPMU_FED_IC_MISS_DEM 0xd3 +#define CPMU_FED_ITLB_MISS 0xd4 + +#define KPC_CLASS_FIXED (0) +#define KPC_CLASS_CONFIGURABLE (1) +#define KPC_CLASS_POWER (2) +#define KPC_CLASS_RAWPMU (3) +#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) +#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) +#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) +#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) + +#define COUNTERS_COUNT 10 +#define CONFIG_COUNT 8 +#define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK) + +#endif + +#ifdef BENCHMARK_OS_WINDOWS +#include +#endif + +#ifdef BENCHMARK_OS_ZOS +#include +#endif + +#include +#ifdef BENCHMARK_STL_ANDROID_GNUSTL +#include +#endif + +#ifdef BENCHMARK_OS_WINDOWS +#include +#undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA +#include +#include +#include +#else +#include +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD +#include +#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \ + defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY +#define BENCHMARK_HAS_SYSCTL +#include +#endif +#endif +#if defined(BENCHMARK_OS_SOLARIS) +#include +#endif +#if defined(BENCHMARK_OS_QNX) +#include +#endif + +#define SOURCE_DIR "${CMAKE_SOURCE_DIR}" +#define CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}" +#define CMAKE_HOST_SYSTEM_PROCESSOR "${CMAKE_HOST_SYSTEM_PROCESSOR}" +#define CMAKE_SYSTEM_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}" +#define CMAKE_HOST_SYSTEM_NAME "${CMAKE_HOST_SYSTEM_NAME}" +#define CMAKE_SYSTEM_NAME "${CMAKE_SYSTEM_NAME}" +#define CMAKE_C_COMPILER "${CMAKE_C_COMPILER}" +#define CMAKE_CXX_COMPILER "${CMAKE_CXX_COMPILER}" +#define CMAKE_CXX_COMPILER_ID "${CMAKE_CXX_COMPILER_ID}" +#define CMAKE_CXX_COMPILER_VERSION "${CMAKE_CXX_COMPILER_VERSION}" +#define 
CMAKE_CROSSCOMPILING "${CMAKE_CROSSCOMPILING}" +#define CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" +#define CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" +#define CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" +#define CMAKE_TOOLCHAIN_FILE "${CMAKE_TOOLCHAIN_FILE}" + +#define TARGET_NAME "${TARGET_NAME}" +#define TARGET_COMPILE_OPTIONS "${TARGET_COMPILE_OPTIONS}" + +/*---------------------------------------------------------------------------------------------------------------------\ + * Lib: +\---------------------------------------------------------------------------------------------------------------------*/ +namespace benchmark { +/* From: https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h + * Now api has been added to be compatible with the rest of fls_bench. + */ +#if defined(__linuxp__) +namespace perf { +template +class LinuxEvents { + + int fd; + perf_event_attr attribs; + bool running; + +public: + LinuxEvents(int config) + : fd(0) { + memset(&attribs, 0, sizeof(attribs)); + attribs.type = TYPE; + attribs.size = sizeof(attribs); + attribs.config = config; + attribs.disabled = 1; + attribs.exclude_kernel = 1; + attribs.exclude_hv = 1; + + const int pid = 0; // the current process + const int cpu = -1; // all CPUs + const int group = -1; // no group + const unsigned long flags = 0; + fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags); + if (fd == -1) { report_error("perf_event_open"); } + + running = false; + } + + ~LinuxEvents() { close(fd); } + + void start() { + if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_RESET)"); } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); } + } + + unsigned long end() { + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); } + + unsigned long result; + if (read(fd, &result, sizeof(result)) == -1) { report_error("read"); } + + return result; + } + + unsigned long now() { + if (!running) { + if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_RESET)"); } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); } + + running = true; + return 0; + } else { + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); } + + unsigned long result; + if (read(fd, &result, sizeof(result)) == -1) { report_error("read"); } + + running = false; + return result; + } + } + +private: + void report_error(const std::string& context) { + throw std::runtime_error(context + ": " + std::string(strerror(errno))); + } +}; + +} // namespace perf + +perf::LinuxEvents cycles(PERF_COUNT_HW_CPU_CYCLES); +#endif + +// NOTE: only i386 and x86_64 have been well tested. +// PPC, sparc, alpha, and ia64 are based on +// http://peter.kuscsik.com/wordpress/?p=14 +// with modifications by m3b. 
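// [annotation] Usage sketch for the perf::LinuxEvents wrapper above. It is
// compiled only when __linuxp__ is defined (the guard as written leaves it
// disabled); the event choice and workload here are illustrative:
//
//   perf::LinuxEvents<PERF_TYPE_HARDWARE> counter(PERF_COUNT_HW_CPU_CYCLES);
//   counter.start();                  // PERF_EVENT_IOC_RESET + _ENABLE
//   run_workload();                   // code under measurement
//   unsigned long n = counter.end();  // _DISABLE, then read(2) the count
//
// now() toggles between those two states, so a single call before and after
// the measured region yields the delta; that is how cycleclock::Now() below
// consumes it on non-Apple aarch64 builds.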
See also +// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h +namespace cycleclock { + +#if defined(__aarch64__) +#if defined(__APPLE__) +static uint64_t g_counters[COUNTERS_COUNT]; +static uint64_t g_config[COUNTERS_COUNT]; +#endif +#endif + +FLS_BENCH_MAYBE_UNUSED static uint64_t get_counters() { +#if defined(__aarch64__) +#if defined(__APPLE__) + static bool WARNED = false; + if (kpc_get_thread_counters(0, COUNTERS_COUNT, g_counters)) { + if (!WARNED) { + printf("kpc_get_thread_counters failed, run as sudo?\n"); + WARNED = true; + } + return 1; + } + // g_counters[3 + 2] gives you the number of instructions 'decoded' + // whereas g_counters[1] might give you the number of instructions 'retired'. + return g_counters[0 + 2]; +#endif +#endif + return 0; +} + +FLS_BENCH_MAYBE_UNUSED static void configure_rdtsc() { +#if defined(__aarch64__) +#if defined(__APPLE__) + if (kpc_set_config(KPC_MASK, g_config)) { + printf("kpc_set_config failed\n"); + return; + } + + if (kpc_force_all_ctrs_set(1)) { + printf("kpc_force_all_ctrs_set failed\n"); + return; + } + + if (kpc_set_counting(KPC_MASK)) { + printf("kpc_set_counting failed\n"); + return; + } + + if (kpc_set_thread_counting(KPC_MASK)) { + printf("kpc_set_thread_counting failed\n"); + return; + } +#endif +#endif +} + +static void Init() { +#if defined(__aarch64__) +#if defined(__APPLE__) + void* kperf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf", RTLD_LAZY); + if (!kperf) { + printf("kperf = %p\n", kperf); + return; + } +#define F(ret, name, ...) \ + name = (name##proc*)(dlsym(kperf, #name)); \ + if (!name) { \ + printf("%s = %p\n", #name, (void*)name); \ + return; \ + } + KPERF_LIST +#undef F + + if (kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) { + printf("wrong fixed counters count\n"); + return; + } + + if (kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) { + printf("wrong fixed config count\n"); + return; + } + g_config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK; + g_config[3] = CPMU_INST_BRANCH | CFGWORD_EL0A64EN_MASK; + g_config[4] = CPMU_SYNC_BR_ANY_MISP | CFGWORD_EL0A64EN_MASK; + g_config[5] = CPMU_INST_A64 | CFGWORD_EL0A64EN_MASK; + + configure_rdtsc(); +#endif +#endif +} +static uint64_t get_counters(); +// This should return the number of cycles since power-on. Thread-safe. +inline BENCHMARK_ALWAYS_INLINE int64_t Now() { + // #if defined(BENCHMARK_OS_MACOSX) + // // this goes at the top because we need ALL Macs, regardless of + // // architecture, to return the number of "mach time units" that + // // have passed since startup. See sysinfo.cc where + // // InitializeSystemInfo() sets the supposed cpu clock frequency of + // // macs to the number of mach time units per second, not actual + // // CPU clock frequency (which can change in the face of CPU + // // frequency scaling). Also note that when the Mac sleeps, this + // // counter pauses; it does not continue counting, nor does it + // // reset to zero. + // return mach_absolute_time(); + // #el +#if defined(BENCHMARK_OS_EMSCRIPTEN) + // this goes above x86-specific code because old versions of Emscripten + // define __x86_64__, although they have nothing to do with it. 
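// [annotation] Recap of the Apple-aarch64 path set up above: Init() dlopens
// the private kperf framework and binds every kpc_* symbol through the
// KPERF_LIST X-macro, configure_rdtsc() programs the fixed counters (core
// cycles, branches, mispredicts, A64 instructions), and get_counters() reads
// the per-thread cycle counter (it warns "run as sudo?" when the kpc call
// fails). Typical sequence, assuming sufficient privileges:
//
//   benchmark::cycleclock::Init();   // bind kperf + program the PMU once
//   int64_t c0 = benchmark::cycleclock::Now();
//   workload();                      // illustrative
//   int64_t cycles = benchmark::cycleclock::Now() - c0;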
+ // return static_cast(emscripten_get_now() * 1e+6); + + return std::chrono::high_resolution_clock::now().time_since_epoch().count(); +#elif defined(__i386__) + int64_t ret; + __asm__ volatile("rdtsc" : "=A"(ret)); + return ret; +#elif defined(__x86_64__) || defined(__amd64__) + uint64_t low, high; + __asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); + return (high << 32) | low; +#elif defined(__powerpc__) || defined(__ppc__) + // This returns a time-base, which is not always precisely a cycle-count. +#if defined(__powerpc64__) || defined(__ppc64__) + int64_t tb; + asm volatile("mfspr %0, 268" : "=r"(tb)); + return tb; +#else + uint32_t tbl, tbu0, tbu1; + asm volatile("mftbu %0\n" + "mftb %1\n" + "mftbu %2" + : "=r"(tbu0), "=r"(tbl), "=r"(tbu1)); + tbl &= -static_cast(tbu0 == tbu1); + // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is no longer needed) + return (static_cast(tbu1) << 32) | tbl; +#endif +#elif defined(__sparc__) + int64_t tick; + asm(".byte 0x83, 0x41, 0x00, 0x00"); + asm("mov %%g1, %0" : "=r"(tick)); + return tick; +#elif defined(__ia64__) + int64_t itc; + asm("mov %0 = ar.itc" : "=r"(itc)); + return itc; +#elif defined(COMPILER_MSVC) && defined(_M_IX86) + // Older MSVC compilers (like 7.x) don't seem to support the + // __rdtsc intrinsic properly, so I prefer to use _asm instead + // when I know it will work. Otherwise, I'll use __rdtsc and hope + // the code is being compiled with a non-ancient compiler. + _asm rdtsc +#elif defined(COMPILER_MSVC) && defined(_M_ARM64) + // See + // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019 + // and https://reviews.llvm.org/D53115 + int64_t virtual_timer_value; + virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT); + return virtual_timer_value; +#elif defined(COMPILER_MSVC) + return __rdtsc(); +#elif defined(BENCHMARK_OS_NACL) + // Native Client validator on x86/x86-64 allows RDTSC instructions, + // and this case is handled above. Native Client validator on ARM + // rejects MRC instructions (used in the ARM-specific sequence below), + // so we handle it here. Portable Native Client compiles to + // architecture-agnostic bytecode, which doesn't provide any + // cycle counter access mnemonics. + + // Native Client does not provide any API to access cycle counter. + // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday + // because is provides nanosecond resolution (which is noticable at + // least for PNaCl modules running on x86 Mac & Linux). + // Initialize to always return 0 if clock_gettime fails. + struct timespec ts = {0, 0}; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif defined(__aarch64__) + // System timer of ARMv8 runs at a different frequency than the CPU's. + // The frequency is fixed, typically in the range 1-50MHz. It can be + // read at CNTFRQ special register. We assume the OS has set up + // the virtual timer properly. + // int64_t virtual_timer_value; + // asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); + // return virtual_timer_value; +#if defined(__APPLE__) + return get_counters(); +#else + return cycles.now(); +#endif + +#elif defined(__ARM_ARCH) + // V6 is the earliest arch that has a standard cyclecount + // Native Client validator doesn't allow MRC instructions. +#if (__ARM_ARCH >= 6) + uint32_t pmccntr; + uint32_t pmuseren; + uint32_t pmcntenset; + // Read the user mode perf monitor counter access permissions. 
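// [annotation] On the x86-64 path above, RDTSC leaves the 64-bit timestamp
// split across EDX:EAX (hence the "=a"/"=d" constraints), recombined as
// (high << 32) | low. Worked out: a 64-bit TSC at 3 GHz wraps only after
// 2^64 / 3e9 s, roughly 195 years, so no wrap handling is needed there --
// unlike the 32-bit PPC and RISC-V sequences, which re-read the high word to
// detect a carry between the two halves.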
+ asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); + if (pmuseren & 1) // Allows reading perfmon counters for user mode code. + { + asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); + if (pmcntenset & 0x80000000ul) // Is it counting? + { + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); + // The counter is set up to count every 64th cycle + return static_cast(pmccntr) * 64; // Should optimize to << 6 + } + } +#endif + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__mips__) || defined(__m68k__) + // mips apparently only allows rdtsc for superusers, so we fall + // back to gettimeofday. It's possible clock_gettime would be better. + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__s390__) // Covers both s390 and s390x. + // Return the CPU clock. + uint64_t tsc; +#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL) + // z/OS XL compiler HLASM syntax. + asm(" stck %0" : "=m"(tsc) : : "cc"); +#else + asm("stck %0" : "=Q"(tsc) : : "cc"); +#endif + return tsc; +#elif defined(__riscv) // RISC-V + // Use RDCYCLE (and RDCYCLEH on riscv32) +#if __riscv_xlen == 32 + uint32_t cycles_lo, cycles_hi0, cycles_hi1; + // This asm also includes the PowerPC overflow handling strategy, as above. + // Implemented in assembly because Clang insisted on branching. + asm volatile("rdcycleh %0\n" + "rdcycle %1\n" + "rdcycleh %2\n" + "sub %0, %0, %2\n" + "seqz %0, %0\n" + "sub %0, zero, %0\n" + "and %1, %1, %0\n" + : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1)); + return (static_cast(cycles_hi1) << 32) | cycles_lo; +#else + uint64_t cycles; + asm volatile("rdcycle %0" : "=r"(cycles)); + return cycles; +#endif +#elif defined(__e2k__) || defined(__elbrus__) + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#else + // The soft failover to a generic implementation is automatic only for ARM. + // For other platforms the developer is expected to make an attempt to create + // a fast implementation and use generic version if nothing better is available. +#error You need to define CycleTimer for your OS and CPU + // return + // std::chrono::high_resolution_clock::now().time_since_epoch().count(); + +#endif +} +} // end namespace cycleclock + +namespace timer { +inline uint64_t Now() { return std::chrono::high_resolution_clock::now().time_since_epoch().count(); } +} // namespace timer + +const int kNumMillisPerSecond = 1000; +const int kNumMicrosPerMilli = 1000; +const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000; +const int kNumNanosPerMicro = 1000; +const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond; + +#ifdef BENCHMARK_OS_WINDOWS +// Window's Sleep takes milliseconds argument. +void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); } +void SleepForSeconds(double seconds) { SleepForMilliseconds(static_cast(kNumMillisPerSecond * seconds)); } +#else // BENCHMARK_OS_WINDOWS +static void SleepForMicroseconds(int microseconds) { +#ifdef BENCHMARK_OS_ZOS + // z/OS does not support nanosleep. Instead call sleep() and then usleep() to + // sleep for the remaining microseconds because usleep() will fail if its + // argument is greater than 1000000. 
+ div_t sleepTime = div(microseconds, kNumMicrosPerSecond); + int seconds = sleepTime.quot; + while (seconds != 0) { + seconds = sleep(seconds); + } + while (usleep(sleepTime.rem) == -1 && errno == EINTR) + ; +#else + struct timespec sleep_time; + sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; + sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; + while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) + ; // Ignore signals and wait for the full interval to elapse. +#endif +} + +static void SleepForMilliseconds(int milliseconds) { SleepForMicroseconds(milliseconds * kNumMicrosPerMilli); } + +FLS_BENCH_MAYBE_UNUSED static void SleepForSeconds(double seconds) { + SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond)); +} +#endif // BENCHMARK_OS_WINDOWS + +namespace internal { +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template <typename T, size_t N> +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. +#ifndef COMPILER_MSVC +template <typename T, size_t N> +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array))) + +} // namespace internal + +// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. +const char kBigSIUnits[] = "kMGTPEZY"; +// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. +const char kBigIECUnits[] = "KMGTPEZY"; +// milli, micro, nano, pico, femto, atto, zepto, yocto. +const char kSmallSIUnits[] = "munpfazy"; + +// We require that all three arrays have the same size. +static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), "SI and IEC unit arrays must be the same size"); +static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), + "Small SI and Big SI unit arrays must be the same size"); + +static const int64_t kUnitsSize = arraysize(kBigSIUnits); + +static void ToExponentAndMantissa( + double val, double thresh, int precision, double one_k, std::string* mantissa, int64_t* exponent) { + std::stringstream mantissa_stream; + + if (val < 0) { + mantissa_stream << "-"; + val = -val; + } + + // Adjust threshold so that it never excludes things which can't be rendered + // in 'precision' digits.
+ const double adjusted_threshold = std::max(thresh, 1.0 / std::pow(10.0, precision)); + const double big_threshold = adjusted_threshold * one_k; + const double small_threshold = adjusted_threshold; + // Values in ]simple_threshold,small_threshold[ will be printed as-is + const double simple_threshold = 0.01; + + if (val > big_threshold) { + // Positive powers + double scaled = val; + for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) { + scaled /= one_k; + if (scaled <= big_threshold) { + mantissa_stream << scaled; + *exponent = i + 1; + *mantissa = mantissa_stream.str(); + return; + } + } + mantissa_stream << val; + *exponent = 0; + } else if (val < small_threshold) { + // Negative powers + if (val < simple_threshold) { + double scaled = val; + for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) { + scaled *= one_k; + if (scaled >= small_threshold) { + mantissa_stream << scaled; + *exponent = -static_cast<int64_t>(i + 1); + *mantissa = mantissa_stream.str(); + return; + } + } + } + mantissa_stream << val; + *exponent = 0; + } else { + mantissa_stream << val; + *exponent = 0; + } + *mantissa = mantissa_stream.str(); +} + +static std::string ExponentToPrefix(int64_t exponent, bool iec) { + if (exponent == 0) { return ""; } + + const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1); + if (index >= kUnitsSize) { return ""; } + + const char* array = (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits); + if (iec) { + return array[index] + std::string("i"); + } else { + return std::string(1, array[index]); + } +} + +static std::string ToBinaryStringFullySpecified(double value, double threshold, int precision, double one_k = 1024.0) { + std::string mantissa; + int64_t exponent; + ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa, &exponent); + return mantissa + ExponentToPrefix(exponent, false); +} + +FLS_BENCH_MAYBE_UNUSED static void AppendHumanReadable(int n, std::string* str) { + std::stringstream ss; + // Round down to the nearest SI prefix. + ss << ToBinaryStringFullySpecified(n, 1.0, 0); + *str += ss.str(); +} + +FLS_BENCH_MAYBE_UNUSED static std::string HumanReadableNumber(double n, double one_k = 1024.0) { + // 1.1 means that figures up to 1.1k should be shown with the next unit down; + // this softens edge effects. + // 1 means that we should show one decimal place of precision. + return ToBinaryStringFullySpecified(n, 1.1, 1, one_k); +} + +static std::string StrFormatImp(const char* msg, va_list args) { + // we might need a second shot at this, so pre-emptively make a copy + va_list args_cp; + va_copy(args_cp, args); + + // TODO(ericwf): use std::array for first attempt to avoid one memory + // allocation guess what the size might be + std::array<char, 256> local_buff; + std::size_t size = local_buff.size(); + // 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation + // in the android-ndk + auto ret = vsnprintf(local_buff.data(), size, msg, args_cp); + + va_end(args_cp); + + // handle empty expansion + if (ret == 0) return std::string {}; + if (static_cast<std::size_t>(ret) < size) { return std::string(local_buff.data()); } + + // we did not provide a long enough buffer on our first attempt.
+ // add 1 to size to account for null-byte in size cast to prevent overflow + size = static_cast(ret) + 1; + auto buff_ptr = std::unique_ptr(new char[size]); + // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation + // in the android-ndk + ret = vsnprintf(buff_ptr.get(), size, msg, args); + return std::string(buff_ptr.get()); +} + +#if defined(__MINGW32__) +__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2))) +#elif defined(__GNUC__) +__attribute__((format(printf, 1, 2))) +#endif + +static std::string +StrFormat(const char* format, ...) { + va_list args; + va_start(args, format); + std::string tmp = StrFormatImp(format, args); + va_end(args); + return tmp; +} + +inline std::ostream& StrCatImp(std::ostream& out) { return out; } + +template +inline std::ostream& StrCatImp(std::ostream& out, First&& f, Rest&&... rest) { + out << std::forward(f); + return StrCatImp(out, std::forward(rest)...); +} + +template +inline std::string StrCat(Args&&... args) { + std::ostringstream ss; + StrCatImp(ss, std::forward(args)...); + return ss.str(); +} + +std::vector StrSplit(const std::string& str, char delim); + +#ifdef BENCHMARK_STL_ANDROID_GNUSTL +/* + * GNU STL in Android NDK lacks support for some C++11 functions, including + * stoul, stoi, stod. We reimplement them here using C functions strtoul, + * strtol, strtod. Note that reimplemented functions are in benchmark:: + * namespace, not std:: namespace. + */ +unsigned long stoul(const std::string& str, size_t* pos = nullptr, int base = 10); +int stoi(const std::string& str, size_t* pos = nullptr, int base = 10); +double stod(const std::string& str, size_t* pos = nullptr); +#else +using std::stod; +using std::stoi; +using std::stoul; +#endif + +class CPUInfo { +public: + struct CacheInfo { + std::string type; + int level; + int size; + int num_sharing; + }; + + enum Scaling { UNKNOWN, ENABLED, DISABLED }; + + static inline std::string ToString(Scaling v) { + switch (v) { + case UNKNOWN: + return "UNKNOWN"; + case ENABLED: + return "ENABLED"; + case DISABLED: + return "DISABLED"; + default: + return "UNKNOWN"; + } + } + + // Getters + static const CPUInfo& getInstance() { + static const CPUInfo info; + return info; + } + int getNumCpus() const { return num_cpus; }; + double getCyclesPerSecond() const { return cycles_per_second; }; + const std::vector& getCaches() const { return caches; }; + const std::vector& getLoadAvg() const { return load_avg; }; + std::string getScaling() const { return ToString(scaling); }; + + int num_cpus; + Scaling scaling; + double cycles_per_second; + std::vector caches; + std::vector load_avg; + +private: + // private constructor + CPUInfo(); + + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CPUInfo); +}; + +static void PrintImp(std::ostream& out) { out << std::endl; } + +template +void PrintImp(std::ostream& out, First&& f, Rest&&... rest) { + out << std::forward(f); + PrintImp(out, std::forward(rest)...); +} + +template +BENCHMARK_NORETURN void PrintErrorAndDie(ARGS&&... args) { + PrintImp(std::cerr, std::forward(args)...); + std::exit(EXIT_FAILURE); +} + +#ifdef BENCHMARK_HAS_SYSCTL + +/// ValueUnion - A type used to correctly alias the byte-for-byte output of +/// `sysctl` with the result type it's to be interpreted as. +struct ValueUnion { + union DataT { + uint32_t uint32_value; + uint64_t uint64_value; + // For correct aliasing of union members from bytes. + char bytes[8]; + }; + using DataPtr = std::unique_ptr; + + // The size of the data union member + its trailing array size. 
+ size_t Size; + DataPtr Buff; + +public: + ValueUnion() + : Size(0) + , Buff(nullptr, &std::free) {} + + explicit ValueUnion(size_t buff_size) + : Size(sizeof(DataT) + buff_size) + , Buff(::new(std::malloc(Size)) DataT(), &std::free) {} + + ValueUnion(ValueUnion&& other) = default; + explicit operator bool() const { return bool(Buff); } + char* data() const { return Buff->bytes; } + std::string GetAsString() const { return {data()}; } + int64_t GetAsInteger() const { + if (Size == sizeof(Buff->uint32_value)) { + return static_cast(Buff->uint32_value); + } else if (Size == sizeof(Buff->uint64_value)) { + return static_cast(Buff->uint64_value); + } + BENCHMARK_UNREACHABLE(); + } + uint64_t GetAsUnsigned() const { + if (Size == sizeof(Buff->uint32_value)) { + return Buff->uint32_value; + } else if (Size == sizeof(Buff->uint64_value)) { + return Buff->uint64_value; + } + BENCHMARK_UNREACHABLE(); + } + template + std::array GetAsArray() { + const int ArrSize = sizeof(T) * N; + // CHECK_LE(ArrSize, Size); + std::array arr; + std::memcpy(arr.data(), data(), ArrSize); + return arr; + } +}; + +static ValueUnion GetSysctlImp(std::string const& name) { +#if defined BENCHMARK_OS_OPENBSD + int mib[2]; + + mib[0] = CTL_HW; + if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")) { + ValueUnion buff(sizeof(int)); + + if (Name == "hw.ncpu") { + mib[1] = HW_NCPU; + } else { + mib[1] = HW_CPUSPEED; + } + + if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) { return ValueUnion(); } + return buff; + } + return ValueUnion(); +#else + size_t cur_buff_size = 0; + if (sysctlbyname(name.c_str(), nullptr, &cur_buff_size, nullptr, 0) == -1) { return {}; } + + ValueUnion buff(cur_buff_size); + if (sysctlbyname(name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0) { return buff; } + return {}; +#endif +} + +FLS_BENCH_MAYBE_UNUSED static bool GetSysctl(std::string const& name, std::string* out) { + out->clear(); + auto buff = GetSysctlImp(name); + if (!buff) { return false; } + out->assign(buff.data()); + return true; +} + +template ::value>::type> +bool GetSysctl(std::string const& name, TP* Out) { + *Out = 0; + auto buff = GetSysctlImp(name); + if (!buff) { return false; } + *Out = static_cast(buff.GetAsUnsigned()); + return true; +} + +template +bool GetSysctl(std::string const& name, std::array* Out) { + auto buff = GetSysctlImp(name); + if (!buff) { return false; } + *Out = buff.GetAsArray(); + return true; +} +#endif + +template +bool ReadFromFile(std::string const& fname, ARG_T* arg) { + *arg = ARG_T(); + std::ifstream f(fname.c_str()); + if (!f.is_open()) { return false; } + f >> *arg; + return f.good(); +} + +static CPUInfo::Scaling CpuScaling(int num_cpus) { + // We don't have a valid CPU count, so don't even bother. + if (num_cpus <= 0) { return CPUInfo::Scaling::UNKNOWN; } +#ifdef BENCHMARK_OS_QNX + return CPUInfo::Scaling::UNKNOWN; +#endif +#ifndef BENCHMARK_OS_WINDOWS + // On Linux, the CPUfreq subsystem exposes CPU information as files on the + // local file system. If reading the exported files fails, then we may not be + // running on Linux, so we silently ignore all the read errors. 
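// [annotation] ReadFromFile() above is the procfs/sysfs twin of GetSysctl():
// one extracts a single >>-parsed value from a file, the other pulls a value
// out of the BSD/macOS sysctl tree. Illustrative calls, using only names
// defined above:
//
//   long khz = 0;
//   ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", &khz);
//
//   #ifdef BENCHMARK_HAS_SYSCTL
//   int ncpu = 0;
//   GetSysctl("hw.ncpu", &ncpu);   // integral overload, backed by sysctlbyname
//   #endif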
+ std::string res; + for (int cpu = 0; cpu < num_cpus; ++cpu) { + std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor"); + if (ReadFromFile(governor_file, &res) && res != "performance") { return CPUInfo::Scaling::ENABLED; } + } + return CPUInfo::Scaling::DISABLED; +#endif + return CPUInfo::Scaling::UNKNOWN; +} + +static int CountSetBitsInCPUMap(std::string val) { + auto count_bits = [](std::string Part) { + using CPUMask = std::bitset; + Part = "0x" + Part; + CPUMask mask(benchmark::stoul(Part, nullptr, 16)); + return static_cast(mask.count()); + }; + size_t pos; + int total = 0; + while ((pos = val.find(',')) != std::string::npos) { + total += count_bits(val.substr(0, pos)); + val = val.substr(pos + 1); + } + if (!val.empty()) { total += count_bits(val); } + return total; +} + +FLS_BENCH_MAYBE_UNUSED +static std::vector GetCacheSizesFromKVFS() { + std::vector res; + std::string dir = "/sys/devices/system/cpu/cpu0/cache/"; + int idx = 0; + while (true) { + CPUInfo::CacheInfo info; + std::string f_path = StrCat(dir, "index", idx++, "/"); + std::ifstream f(StrCat(f_path, "size").c_str()); + if (!f.is_open()) { break; } + std::string suffix; + f >> info.size; + if (f.fail()) { PrintErrorAndDie("Failed while reading file '", f_path, "size'"); } + if (f.good()) { + f >> suffix; + if (f.bad()) { + PrintErrorAndDie("Invalid cache size format: failed to read size suffix"); + } else if (f && suffix != "K") { + PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix); + } else if (suffix == "K") { + info.size *= 1024; + } + } + if (!ReadFromFile(StrCat(f_path, "type"), &info.type)) { + PrintErrorAndDie("Failed to read from file ", f_path, "type"); + } + if (!ReadFromFile(StrCat(f_path, "level"), &info.level)) { + PrintErrorAndDie("Failed to read from file ", f_path, "level"); + } + std::string map_str; + if (!ReadFromFile(StrCat(f_path, "shared_cpu_map"), &map_str)) { + PrintErrorAndDie("Failed to read from file ", f_path, "shared_cpu_map"); + } + info.num_sharing = CountSetBitsInCPUMap(map_str); + res.push_back(info); + } + + return res; +} + +#ifdef BENCHMARK_OS_MACOSX +std::vector GetCacheSizesMacOSX() { + std::vector res; + std::array cache_counts {{0, 0, 0, 0}}; + GetSysctl("hw.cacheconfig", &cache_counts); + + struct { + std::string name; + std::string type; + int level; + uint64_t num_sharing; + } Cases[] = {{"hw.l1dcachesize", "Data", 1, cache_counts[1]}, + {"hw.l1icachesize", "Instruction", 1, cache_counts[1]}, + {"hw.l2cachesize", "Unified", 2, cache_counts[2]}, + {"hw.l3cachesize", "Unified", 3, cache_counts[3]}}; + for (auto& c : Cases) { + int val; + if (!GetSysctl(c.name, &val)) { continue; } + CPUInfo::CacheInfo info; + info.type = c.type; + info.level = c.level; + info.size = val; + info.num_sharing = static_cast(c.num_sharing); + res.push_back(std::move(info)); + } + return res; +} +#elif defined(BENCHMARK_OS_WINDOWS) +std::vector GetCacheSizesWindows() { + std::vector res; + DWORD buffer_size = 0; + using PInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION; + using CInfo = CACHE_DESCRIPTOR; + + using UPtr = std::unique_ptr; + GetLogicalProcessorInformation(nullptr, &buffer_size); + UPtr buff((PInfo*)malloc(buffer_size), &std::free); + if (!GetLogicalProcessorInformation(buff.get(), &buffer_size)) + PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ", GetLastError()); + + PInfo* it = buff.get(); + PInfo* end = buff.get() + (buffer_size / sizeof(PInfo)); + + for (; it != end; ++it) { + if (it->Relationship != 
RelationCache) { continue; } + using BitSet = std::bitset; + BitSet B(it->ProcessorMask); + // To prevent duplicates, only consider caches where CPU 0 is specified + if (!B.test(0)) { continue; } + CInfo* Cache = &it->Cache; + CPUInfo::CacheInfo C; + C.num_sharing = static_cast(B.count()); + C.level = Cache->Level; + C.size = Cache->Size; + switch (Cache->Type) { + case CacheUnified: + C.type = "Unified"; + break; + case CacheInstruction: + C.type = "Instruction"; + break; + case CacheData: + C.type = "Data"; + break; + case CacheTrace: + C.type = "Trace"; + break; + default: + C.type = "Unknown"; + break; + } + res.push_back(C); + } + return res; +} +#elif BENCHMARK_OS_QNX +std::vector GetCacheSizesQNX() { + std::vector res; + struct cacheattr_entry* cache = SYSPAGE_ENTRY(cacheattr); + uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr); + int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize; + for (int i = 0; i < num; ++i) { + CPUInfo::CacheInfo info; + switch (cache->flags) { + case CACHE_FLAG_INSTR: + info.type = "Instruction"; + info.level = 1; + break; + case CACHE_FLAG_DATA: + info.type = "Data"; + info.level = 1; + break; + case CACHE_FLAG_UNIFIED: + info.type = "Unified"; + info.level = 2; + break; + case CACHE_FLAG_SHARED: + info.type = "Shared"; + info.level = 3; + break; + default: + continue; + break; + } + info.size = cache->line_size * cache->num_lines; + info.num_sharing = 0; + res.push_back(std::move(info)); + cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize); + } + return res; +} +#endif + +static std::vector GetCacheSizes() { +#ifdef BENCHMARK_OS_MACOSX + return GetCacheSizesMacOSX(); +#elif defined(BENCHMARK_OS_WINDOWS) + return GetCacheSizesWindows(); +#elif defined(BENCHMARK_OS_QNX) + return GetCacheSizesQNX(); +#else + return GetCacheSizesFromKVFS(); +#endif +} + +FLS_BENCH_MAYBE_UNUSED std::string GetSystemName() { +#if defined(BENCHMARK_OS_WINDOWS) + std::string str; + const unsigned COUNT = MAX_COMPUTERNAME_LENGTH + 1; + TCHAR hostname[COUNT] = {'\0'}; + DWORD DWCOUNT = COUNT; + if (!GetComputerName(hostname, &DWCOUNT)) { return std::string(""); } +#ifndef UNICODE + str = std::string(hostname, DWCOUNT); +#else + // Using wstring_convert, Is deprecated in C++17 + using convert_type = std::codecvt_utf8; + std::wstring_convert converter; + std::wstring wStr(hostname, DWCOUNT); + str = converter.to_bytes(wStr); +#endif + return str; +#else // defined(BENCHMARK_OS_WINDOWS) +#ifndef HOST_NAME_MAX +#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_NACL) +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_QNX) +#define HOST_NAME_MAX 154 +#elif defined(BENCHMARK_OS_RTEMS) +#define HOST_NAME_MAX 256 +#else +#warning "HOST_NAME_MAX not defined. using 64" +#define HOST_NAME_MAX 64 +#endif +#endif // def HOST_NAME_MAX + char hostname[HOST_NAME_MAX]; + int retVal = gethostname(hostname, HOST_NAME_MAX); + if (retVal != 0) { return std::string(""); } + return std::string(hostname); +#endif // Catch-all POSIX block. +} + +static int GetNumCPUs() { +#ifdef BENCHMARK_HAS_SYSCTL + int NumCPU = -1; + if (GetSysctl("hw.ncpu", &NumCPU)) { return NumCPU; } + fprintf(stderr, "Err: %s\n", strerror(errno)); + std::exit(EXIT_FAILURE); +#elif defined(BENCHMARK_OS_WINDOWS) + SYSTEM_INFO sysinfo; + // Use memset as opposed to = {} to avoid GCC missing initializer false + // positives. 
+ std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO)); + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; // number of logical + // processors in the current + // group +#elif defined(BENCHMARK_OS_SOLARIS) + // Returns -1 in case of a failure. + int NumCPU = sysconf(_SC_NPROCESSORS_ONLN); + if (NumCPU < 0) { fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n", strerror(errno)); } + return NumCPU; +#elif defined(BENCHMARK_OS_QNX) + return static_cast(_syspage_ptr->num_cpu); +#else + int NumCPUs = 0; + int MaxID = -1; + std::ifstream f("/proc/cpuinfo"); + if (!f.is_open()) { + std::cerr << "failed to open /proc/cpuinfo\n"; + return -1; + } + const std::string Key = "processor"; + std::string ln; + while (std::getline(f, ln)) { + if (ln.empty()) { continue; } + size_t SplitIdx = ln.find(':'); + std::string value; +#if defined(__s390__) + // s390 has another format in /proc/cpuinfo + // it needs to be parsed differently + if (SplitIdx != std::string::npos) { value = ln.substr(Key.size() + 1, SplitIdx - Key.size() - 1); } +#else + if (SplitIdx != std::string::npos) { value = ln.substr(SplitIdx + 1); } +#endif + if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) { + NumCPUs++; + if (!value.empty()) { + int CurID = benchmark::stoi(value); + MaxID = std::max(CurID, MaxID); + } + } + } + if (f.bad()) { + std::cerr << "Failure reading /proc/cpuinfo\n"; + return -1; + } + if (!f.eof()) { + std::cerr << "Failed to read to end of /proc/cpuinfo\n"; + return -1; + } + f.close(); + + if ((MaxID + 1) != NumCPUs) { + fprintf(stderr, + "CPU ID assignments in /proc/cpuinfo seem messed up." + " This is usually caused by a bad BIOS.\n"); + } + return NumCPUs; +#endif + BENCHMARK_UNREACHABLE(); +} + +static double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) { + // Currently, scaling is only used on linux path here, + // suppress diagnostics about it being unused on other paths. + (void)scaling; + +#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN + long freq; + + // If the kernel is exporting the tsc frequency use that. There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporintg an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq) + // If CPU scaling is disabled, use the the *current* frequency. + // Note that we specifically don't want to read cpuinfo_cur_freq, + // because it is only readable by root. + || (scaling == CPUInfo::Scaling::DISABLED && + ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", &freq)) + // Otherwise, if CPU scaling may be in effect, we want to use + // the *maximum* frequency, not whatever CPU speed some random processor + // happens to be using now. + || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". 
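// [annotation] i.e. sysfs reports kHz, so the 2 GHz example works out to
// 2000000 kHz * 1000.0 = 2.0e9 cycles per second in the return below.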
+ return freq * 1000.0; + } + + const double error_value = -1; + double bogo_clock = error_value; + + std::ifstream f("/proc/cpuinfo"); + if (!f.is_open()) { + std::cerr << "failed to open /proc/cpuinfo\n"; + return error_value; + } + + auto startsWithKey = [](std::string const& Value, std::string const& Key) { + if (Key.size() > Value.size()) { return false; } + auto Cmp = [&](char X, char Y) { + return std::tolower(X) == std::tolower(Y); + }; + return std::equal(Key.begin(), Key.end(), Value.begin(), Cmp); + }; + + std::string ln; + while (std::getline(f, ln)) { + if (ln.empty()) { continue; } + size_t SplitIdx = ln.find(':'); + std::string value; + if (SplitIdx != std::string::npos) { value = ln.substr(SplitIdx + 1); } + // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only + // accept positive values. Some environments (virtual machines) report zero, + // which would cause infinite looping in WallTime_Init. + if (startsWithKey(ln, "cpu MHz")) { + if (!value.empty()) { + double cycles_per_second = benchmark::stod(value) * 1000000.0; + if (cycles_per_second > 0) { return cycles_per_second; } + } + } else if (startsWithKey(ln, "bogomips")) { + if (!value.empty()) { + bogo_clock = benchmark::stod(value) * 1000000.0; + if (bogo_clock < 0.0) { bogo_clock = error_value; } + } + } + } + if (f.bad()) { + std::cerr << "Failure reading /proc/cpuinfo\n"; + return error_value; + } + if (!f.eof()) { + std::cerr << "Failed to read to end of /proc/cpuinfo\n"; + return error_value; + } + f.close(); + // If we found the bogomips clock, but nothing better, we'll use it (but + // we're not happy about it); otherwise, fallback to the rough estimation + // below. + if (bogo_clock >= 0.0) { return bogo_clock; } + +#elif defined BENCHMARK_HAS_SYSCTL + constexpr auto* FreqStr = +#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD) + "machdep.tsc_freq"; +#elif defined BENCHMARK_OS_OPENBSD + "hw.cpuspeed"; +#elif defined BENCHMARK_OS_DRAGONFLY + "hw.tsc_frequency"; +#else + "hw.cpufrequency"; +#endif + unsigned long long hz = 0; +#if defined BENCHMARK_OS_OPENBSD + if (GetSysctl(FreqStr, &hz)) { return hz * 1000000; } +#else + if (GetSysctl(FreqStr, &hz)) { return hz; } +#endif + fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", FreqStr, strerror(errno)); + +#elif defined BENCHMARK_OS_WINDOWS + // In NT, read MHz from the registry. If we fail to do so or we're in win9x + // then make a crude estimate. 
+ DWORD data, data_size = sizeof(data); + if (IsWindowsXPOrGreater() && SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "~MHz", + nullptr, + &data, + &data_size))) + return static_cast((int64_t)data * (int64_t)(1000 * 1000)); // was mhz +#elif defined(BENCHMARK_OS_SOLARIS) + kstat_ctl_t* kc = kstat_open(); + if (!kc) { + std::cerr << "failed to open /dev/kstat\n"; + return -1; + } + kstat_t* ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0"); + if (!ksp) { + std::cerr << "failed to lookup in /dev/kstat\n"; + return -1; + } + if (kstat_read(kc, ksp, NULL) < 0) { + std::cerr << "failed to read from /dev/kstat\n"; + return -1; + } + kstat_named_t* knp = (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz"); + if (!knp) { + std::cerr << "failed to lookup data in /dev/kstat\n"; + return -1; + } + if (knp->data_type != KSTAT_DATA_UINT64) { + std::cerr << "current_clock_Hz is of unexpected data type: " << knp->data_type << "\n"; + return -1; + } + double clock_hz = knp->value.ui64; + kstat_close(kc); + return clock_hz; +#elif defined(BENCHMARK_OS_QNX) + return static_cast((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) * (int64_t)(1000 * 1000)); +#endif + // If we've fallen through, attempt to roughly estimate the CPU clock rate. + const int estimate_time_ms = 1000; + cycleclock::Init(); + const auto start_ticks = cycleclock::Now(); + SleepForMilliseconds(estimate_time_ms); + return static_cast(cycleclock::Now() - start_ticks); +} + +static std::vector GetLoadAvg() { +#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || defined BENCHMARK_OS_MACOSX || \ + defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \ + !defined(__ANDROID__) + constexpr int kMaxSamples = 3; + std::vector res(kMaxSamples, 0.0); + const int nelem = getloadavg(res.data(), kMaxSamples); + if (nelem < 1) { + res.clear(); + } else { + res.resize(nelem); + } + return res; +#else + return {}; +#endif +} + +// private constructor +CPUInfo::CPUInfo() + : num_cpus(GetNumCPUs()) + , scaling(CpuScaling(num_cpus)) + , cycles_per_second(GetCPUCyclesPerSecond(scaling)) + , caches(GetCacheSizes()) + , load_avg(GetLoadAvg()) {} + +struct SystemInfo { + + static std::string GetSystemName() { +#if defined(BENCHMARK_OS_WINDOWS) + std::string str; + const unsigned COUNT = MAX_COMPUTERNAME_LENGTH + 1; + TCHAR hostname[COUNT] = {'\0'}; + DWORD DWCOUNT = COUNT; + if (!GetComputerName(hostname, &DWCOUNT)) { return std::string(""); } +#ifndef UNICODE + str = std::string(hostname, DWCOUNT); +#else + // Using wstring_convert, Is deprecated in C++17 + using convert_type = std::codecvt_utf8; + std::wstring_convert converter; + std::wstring wStr(hostname, DWCOUNT); + str = converter.to_bytes(wStr); +#endif + return str; +#else // defined(BENCHMARK_OS_WINDOWS) +#ifndef HOST_NAME_MAX +#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_NACL) +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_QNX) +#define HOST_NAME_MAX 154 +#elif defined(BENCHMARK_OS_RTEMS) +#define HOST_NAME_MAX 256 +#else +#warning "HOST_NAME_MAX not defined. using 64" +#define HOST_NAME_MAX 64 +#endif +#endif // def HOST_NAME_MAX + char hostname[HOST_NAME_MAX]; + int ret_val = gethostname(hostname, HOST_NAME_MAX); + if (ret_val != 0) { return std::string(""); } + return {hostname}; +#endif // Catch-all POSIX block. 
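// [annotation] The last-resort path at the end of GetCPUCyclesPerSecond()
// above is a direct calibration: sample the cycle counter, sleep, sample
// again. Because estimate_time_ms is 1000, the rate is just the difference:
//
//   rate = (now_end - now_start) / (estimate_time_ms / 1000.0)
//        = now_end - now_start            // for a one-second sleep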
+    }
+
+    static const SystemInfo& getInstance() {
+        static const SystemInfo INFO;
+        return INFO;
+    }
+
+private:
+    SystemInfo()
+        : m_name(GetSystemName()) {}
+
+public:
+    const std::string& getName() const { return m_name; }
+
+private:
+    std::string m_name;
+
+    BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SystemInfo);
+};
+
+static std::string LocalDateTimeString() {
+    // Write the local time in RFC3339 format yyyy-mm-ddTHH:MM:SS+/-HH:MM.
+    using Clock = std::chrono::system_clock;
+    std::time_t now = Clock::to_time_t(Clock::now());
+    const std::size_t kTzOffsetLen = 6;
+    const std::size_t kTimestampLen = 19;
+
+    std::size_t tz_len;
+    std::size_t timestamp_len;
+    long int offset_minutes;
+    char tz_offset_sign = '+';
+    // tz_offset is set in one of three ways:
+    // * strftime with %z - This either returns empty or the ISO 8601 time. The
+    //   maximum length an ISO 8601 string can be is 7 (e.g. -03:30, plus trailing zero).
+    // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to
+    //   19 for %02li, one for :, up to 19 for %02li, plus trailing zero).
+    // * A fixed string of "-00:00". The maximum length is 7 (-00:00, plus
+    //   trailing zero).
+    //
+    // Thus, the maximum size this needs to be is 41.
+    char tz_offset[41];
+    // Long enough buffer to avoid format-overflow warnings
+    char storage[128];
+
+#if defined(BENCHMARK_OS_WINDOWS)
+    std::tm* timeinfo_p = ::localtime(&now);
+#else
+    std::tm timeinfo;
+    std::tm* timeinfo_p = &timeinfo;
+    ::localtime_r(&now, &timeinfo);
+#endif
+
+    tz_len = std::strftime(tz_offset, sizeof(tz_offset), "%z", timeinfo_p);
+
+    if (tz_len < kTzOffsetLen && tz_len > 1) {
+        // Timezone offset was written. strftime writes offset as +HHMM or -HHMM,
+        // RFC3339 specifies an offset as +HH:MM or -HH:MM. To convert, we parse
+        // the offset as an integer, then reprint it to a string.
+        // For example, "+0530" parses to 530; 530 / 100 = 5 and 530 % 100 = 30,
+        // which reprints as "+05:30".
+
+        offset_minutes = ::strtol(tz_offset, NULL, 10);
+        if (offset_minutes < 0) {
+            offset_minutes *= -1;
+            tz_offset_sign = '-';
+        }
+
+        tz_len = ::snprintf(
+            tz_offset, sizeof(tz_offset), "%c%02li:%02li", tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+        ((void)tz_len); // Prevent unused variable warning in optimized build.
+    } else {
+        // Unknown offset. RFC3339 specifies that unknown local offsets should be
+        // written as UTC time with -00:00 timezone.
+#if defined(BENCHMARK_OS_WINDOWS)
+        // Potential race condition if another thread calls localtime or gmtime.
+        timeinfo_p = ::gmtime(&now);
+#else
+        ::gmtime_r(&now, &timeinfo);
+#endif
+
+        strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
+    }
+
+    timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S", timeinfo_p);
+    // Prevent unused variable warning in optimized build.
+    ((void)kTimestampLen);
+
+    std::strncat(storage, tz_offset, sizeof(storage) - timestamp_len - 1);
+    return {storage};
+}
+
+class CPUInfo;
+struct SystemInfo;
+class BenchmarkReporter {
+public:
+    struct Context {
+        CPUInfo const& cpu_info;
+        SystemInfo const& sys_info;
+        // The number of chars in the longest benchmark name.
+        size_t name_field_width;
+
+        Context()
+            : cpu_info(CPUInfo::getInstance())
+            , sys_info(SystemInfo::getInstance()) {}
+    };
+
+    class Run {
+    public:
+        static const int64_t no_repetition_index = -1;
+        enum RunType { RT_Iteration, RT_Aggregate };
+
+        explicit Run(int benchmark_number, std::string& name, uint64_t iterations, double cycles_per_tuple)
+            : benchmark_number(benchmark_number)
+            , name(name)
+            , iterations(iterations)
+            , cycles_per_tuple(cycles_per_tuple)
+        // time_unit(kNanosecond),
+        // real_accumulated_time(0),
+        // cpu_accumulated_time(0),
+        // max_heapbytes_used(0),
+        // complexity(oNone),
+        // complexity_lambda(),
+        // complexity_n(0),
+        // report_big_o(false),
+        // report_rms(false),
+        // counters(),
+        // has_memory_result(false),
+        // allocs_per_iter(0.0),
+        // max_bytes_used(0)
+        {}
+
+        std::string benchmark_name() const;
+        int benchmark_number;
+        std::string name;
+        uint64_t iterations;
+        RunType run_type;
+        std::string aggregate_name;
+        bool error_occurred;
+        std::string error_message;
+
+        // TimeUnit time_unit;
+        double cycles_per_tuple;
+
+        // Memory metrics.
+        bool has_memory_result;
+        double allocs_per_iter;
+        int64_t max_bytes_used;
+    };
+
+    // Construct a BenchmarkReporter with the output stream set to 'std::cout'
+    // and the error stream set to 'std::cerr'
+    BenchmarkReporter()
+        : output_stream_(&std::cout)
+        , error_stream_(&std::cerr) {}
+
+    // Called once for every suite of benchmarks run.
+    // The parameter "context" contains information that the
+    // reporter may wish to use when generating its report, for example the
+    // platform under which the benchmarks are running. The benchmark run is
+    // never started if this function returns false, allowing the reporter
+    // to skip runs based on the context information.
+    virtual bool ReportContext(const Context& context) = 0;
+
+    // Called once for each group of benchmark runs, gives information about
+    // cpu-time and heap memory usage during the benchmark run. If the group
+    // of runs contained more than two entries then 'report' contains additional
+    // elements representing the mean and standard deviation of those runs.
+    // Additionally if this group of runs was the last in a family of benchmarks
+    // 'reports' contains additional entries representing the asymptotic
+    // complexity and RMS of that benchmark family.
+    virtual void ReportRuns(std::vector<Run>& report) = 0;
+
+    // Called once and only once after every group of benchmarks is run and
+    // reported.
+    virtual void Finalize() {}
+
+    // REQUIRES: The object referenced by 'out' is valid for the lifetime
+    // of the reporter.
+    void SetOutputStream(std::ostream* out) {
+        assert(out);
+        output_stream_ = out;
+    }
+
+    // REQUIRES: The object referenced by 'err' is valid for the lifetime
+    // of the reporter.
+    void SetErrorStream(std::ostream* err) {
+        assert(err);
+        error_stream_ = err;
+    }
+
+    static std::ostream& GetOutputStream() { return std::cout; }
+
+    static std::ostream& GetErrorStream() { return std::cerr; }
+
+    virtual ~BenchmarkReporter();
+
+    // Write a human readable string to 'out' representing the specified
+    // 'context'.
+    // REQUIRES: 'out' is non-null.
+    static void PrintBasicContext(std::ostream& out, Context const& context) {
+        // CHECK(out) << "cannot be null";
+        auto& Out = out;
+
+        Out << LocalDateTimeString() << "\n";
+
+        const CPUInfo& info = context.cpu_info;
+        Out << "Run on (" << info.num_cpus << " X " << (info.cycles_per_second / 1000000.0) << " MHz CPU "
+            << ((info.num_cpus > 1) ? "s" : "") << ")\n";
+        if (info.caches.size() != 0) {
+            Out << "CPU Caches:\n";
+            for (auto& CInfo : info.caches) {
+                Out << "  L" << CInfo.level << " " << CInfo.type << " " << (CInfo.size / 1024) << " KiB";
+                if (CInfo.num_sharing != 0) { Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")"; }
+                Out << "\n";
+            }
+        }
+        if (!info.load_avg.empty()) {
+            Out << "Load Average: ";
+            for (auto It = info.load_avg.begin(); It != info.load_avg.end();) {
+                Out << StrFormat("%.2f", *It++);
+                if (It != info.load_avg.end()) { Out << ", "; }
+            }
+            Out << "\n";
+        }
+
+        if (CPUInfo::Scaling::ENABLED == info.scaling) {
+            Out << "***WARNING*** CPU scaling is enabled, the benchmark "
+                   "real time measurements may be noisy and will incur extra "
+                   "overhead.\n";
+        }
+
+#ifndef NDEBUG
+        Out << "***WARNING*** Library was built as DEBUG. Timings may be "
+               "affected.\n";
+#endif
+    }
+
+private:
+    std::ostream* output_stream_;
+    std::ostream* error_stream_;
+};
+
+// https://github.com/emscripten-core/emscripten/wiki/Emterpreter or use node.js
+// https://stackoverflow.com/questions/32573289/text-written-to-stdout-doesnt-appear-until-program-completion
+static void printRun(std::ostream& out, benchmark::BenchmarkReporter::Run& run) {
+    out << run.benchmark_number << ",";
+    out << run.name << ",";
+    out << run.iterations << ",";
+    out << run.cycles_per_tuple;
+    out << '\n';
+}
+
+FLS_BENCH_MAYBE_UNUSED static std::string CsvEscape(const std::string& s) {
+    std::string tmp;
+    tmp.reserve(s.size() + 2);
+    for (char c : s) {
+        switch (c) {
+        case '"':
+            tmp += "\"\"";
+            break;
+        default:
+            tmp += c;
+            break;
+        }
+    }
+    return '"' + tmp + '"';
+}
+
+static void printHeader(std::ostream& out) {
+    out << "benchmark_number,";
+    out << "name,";
+    out << "iterations,";
+    out << "cycles_per_tuple";
+    out << "\n";
+}
+
+class CSVReporter : public BenchmarkReporter {
+public:
+    explicit CSVReporter(std::string path)
+        : path(std::move(path))
+        , printed_header(false) {}
+    bool ReportContext(const Context& context) override {
+        PrintBasicContext(GetErrorStream(), context);
+        return true;
+    }
+    static void PrintContext() { PrintBasicContext(GetErrorStream(), benchmark::BenchmarkReporter::Context()); }
+    static void WriteRuns(std::vector<Run>& reports, const std::string& path) {
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+        // alternatives :
+        // https://stackoverflow.com/questions/67174663/cannot-save-the-file-to-specific-directory-by-wasm
+        std::cerr << "Modern web browsers do not allow web pages to write/open a local file in your machine.";
+#else
+        std::fstream file;
+        file.open(path, std::fstream::out);
+        printHeader(file);
+
+        // print results for each run
+        for (auto& run : reports) {
+            printRun(file, run);
+        }
+
+        if (file.fail()) {
+            std::cerr << "Error: " << strerror(errno) << ": " << path << "\n";
+            throw std::exception();
+        }
+
+        std::cout << "benchmark result has been written to " << path << '\n';
+#endif
+    }
+    static void WriteContext(std::string& cmake_info, const std::string& path) {
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+        // alternatives :
+        // https://stackoverflow.com/questions/67174663/cannot-save-the-file-to-specific-directory-by-wasm
+        std::cerr << "Modern web browsers do not allow web pages to write/open a local file in your machine.";
+#else
+        std::fstream file;
+        file.open(path, std::fstream::out);
+        PrintBasicContext(file, benchmark::BenchmarkReporter::Context());
+        file << cmake_info;
+        if (file.fail()) {
+            std::cerr << "Error: " << strerror(errno) << ": " << path << "\n";
+            throw std::exception();
+        }
+
+        std::cout << "benchmark metadata has been written to " << path << '\n';
+#endif
+    }
+    static void PrintRun(Run& run) {
+        std::ostream& out = GetOutputStream();
+        printRun(out, run);
+    }
+
+public:
+    const std::string path;
+    bool printed_header;
+    std::set<std::string> user_counter_names;
+};
+
+using report = benchmark::BenchmarkReporter::Run;
+
+class Benchmark {
+private:
+    explicit Benchmark(std::string name)
+        : m_name(std::move(name))
+        , m_cpu_info(CPUInfo::getInstance())
+        , m_system_info(SystemInfo::getInstance())
+        , m_enable_save(false)
+        , m_enable_print(false)
+        , m_result_file {' '}
+        , m_metadata_file {' '} {} //
+public:
+    friend class BenchmarkBuilder;
+    void Run(benchmark::BenchmarkReporter::Run run) {
+        if (m_enable_print) { benchmark::CSVReporter::PrintRun(run); }
+
+        m_runs.push_back(run);
+    }
+    ~Benchmark() {
+        if (m_enable_save && !m_runs.empty()) {
+            CSVReporter::WriteRuns(m_runs, m_result_file);
+            CSVReporter::WriteContext(m_extra_info, m_metadata_file);
+        }
+    } //
+private:
+    std::string m_name;
+    const CPUInfo& m_cpu_info;
+    const SystemInfo& m_system_info;
+    std::string m_extra_info;
+    bool m_enable_save;
+    bool m_enable_print;
+    std::string m_result_file;
+    std::string m_metadata_file;
+    std::string m_dir;
+    std::vector<benchmark::BenchmarkReporter::Run> m_runs;
+};
+
+class CmakeInfo;
+constexpr auto METADATA_PREFIX {".metadata"};
+constexpr auto CSV_PREFIX {".csv"};
+
+class BenchmarkBuilder {
+public:
+    explicit BenchmarkBuilder(std::string name)
+        : m_benchmark(std::move(name)) {} //
+public:
+    operator Benchmark() const { return m_benchmark; }
+    benchmark::BenchmarkBuilder& save() {
+        // Enable writing results to disk; paths default to the current directory.
+        m_benchmark.m_metadata_file = "./" + m_benchmark.m_metadata_file + METADATA_PREFIX;
+        m_benchmark.m_result_file   = "./" + m_benchmark.m_result_file + CSV_PREFIX;
+        m_benchmark.m_enable_save = true;
+        return *this;
+    }
+    benchmark::BenchmarkBuilder& print() {
+        m_benchmark.m_enable_print = true;
+        return *this;
+    }
+    benchmark::BenchmarkBuilder& at(const std::string& dir) {
+        m_benchmark.m_metadata_file = dir + "/" + m_benchmark.m_name + METADATA_PREFIX;
+        m_benchmark.m_result_file = dir + "/" + m_benchmark.m_name + CSV_PREFIX;
+        return *this;
+    }
+    benchmark::BenchmarkBuilder& add_extra_info(const std::string& info) {
+        m_benchmark.m_extra_info += info;
+        return *this;
+    }
+
+private:
+    Benchmark m_benchmark; //
+};
+
+FLS_BENCH_MAYBE_UNUSED static BenchmarkBuilder create(const std::string& name) {
+    cycleclock::Init();
+    return BenchmarkBuilder(name);
+}
+
+class CmakeInfo {
+public:
+    static const CmakeInfo& getInstance() {
+        static const CmakeInfo INFO;
+        return INFO;
+    }
+
+public:
+    const std::string source_dir = SOURCE_DIR;
+    const std::string cmake_osx_architectures = CMAKE_OSX_ARCHITECTURES;
+    const std::string cmake_host_system_processor = CMAKE_HOST_SYSTEM_PROCESSOR;
+    const std::string cmake_system_processor = CMAKE_SYSTEM_PROCESSOR;
+    const std::string cmake_host_system_name = CMAKE_HOST_SYSTEM_NAME;
+    const std::string cmake_system_name = CMAKE_SYSTEM_NAME;
+    const std::string cmake_c_compiler = CMAKE_C_COMPILER;
+    const std::string cmake_cxx_compiler = CMAKE_CXX_COMPILER;
+    const std::string cmake_cxx_compiler_id = CMAKE_CXX_COMPILER_ID;
+    const std::string cmake_cxx_compiler_version = CMAKE_CXX_COMPILER_VERSION;
+    const std::string cmake_crosscompiling = CMAKE_CROSSCOMPILING;
+    const std::string cmake_cxx_flags_debug = CMAKE_CXX_FLAGS_DEBUG;
+    const std::string cmake_cxx_flags_release = CMAKE_CXX_FLAGS_RELEASE;
+    const std::string cmake_build_type = CMAKE_BUILD_TYPE;
+    const std::string cmake_toolchain_file = CMAKE_TOOLCHAIN_FILE;
+    const std::string target_name = TARGET_NAME;
+    const std::string target_compile_options = TARGET_COMPILE_OPTIONS;
+
+public:
+    const std::string& getSourceDir() const { return source_dir; }
+    const std::string& getCmakeOsxArchitectures() const { return cmake_osx_architectures; }
+    const std::string& getCmakeHostSystemProcessor() const { return cmake_host_system_processor; }
+    const std::string& getCmakeSystemProcessor() const { return cmake_system_processor; }
+    const std::string& getCmakeHostSystemName() const { return cmake_host_system_name; }
+    const std::string& getCmakeSystemName() const { return cmake_system_name; }
+    const std::string& getCmakeCCompiler() const { return cmake_c_compiler; }
+    const std::string& getCmakeCxxCompiler() const { return cmake_cxx_compiler; }
+    const std::string& getCmakeCxxCompilerId() const { return cmake_cxx_compiler_id; }
+    const std::string& getCmakeCxxCompilerVersion() const { return cmake_cxx_compiler_version; }
+    const std::string& getCmakeCrosscompiling() const { return cmake_crosscompiling; }
+    const std::string& getCmakeCxxFlagsDebug() const { return cmake_cxx_flags_debug; }
+    const std::string& getCmakeCxxFlagsRelease() const { return cmake_cxx_flags_release; }
+    const std::string& getCmakeBuildType() const { return cmake_build_type; }
+    const std::string& get_cmakeToolchainFile() const { return cmake_toolchain_file; }
+    const std::string& getTargetName() const { return target_name; }
+    const std::string& getTargetCompileOptions() const { return target_compile_options; }
+
+    static void PrintCmake() { printCmakeInfo(std::cout); }
+    static void AppendCmake(const std::string& path) {
+        std::fstream file;
+        file.open(path, std::fstream::app);
+        printCmakeInfo(file);
+
+        if (file.fail()) {
+            std::cerr << "Error: " << strerror(errno) << "\n";
+            throw std::exception();
+        }
+
+        std::cout << "result has been written to " + path;
+    }
+    static void printCmakeInfo(std::ostream& out) {
+        const CmakeInfo& info = getInstance();
+        out << info.getCmakeInfo();
+    }
+
+    // https://stackoverflow.com/a/46931770/5165633
+    static std::vector<std::string> split(const std::string& s, char delim) {
+        std::vector<std::string> result;
+        std::stringstream ss(s);
+        std::string item;
+
+        while (getline(ss, item, delim)) {
+            result.push_back(item);
+        }
+
+        return result;
+    }
+
+    static std::string getCmakeToolchainFile() {
+        const CmakeInfo& info = getInstance();
+        std::vector<std::string> v = split(info.get_cmakeToolchainFile(), '/');
+
+        auto tool_chain_file_str = v[v.size() - 1];
+        // Strip the trailing ".cmake" (6 characters) from the file name.
+        return tool_chain_file_str.substr(0, tool_chain_file_str.size() - 6);
+    }
+
+    static std::string getCmakeInfo() {
+        std::ostringstream out;
+        const CmakeInfo& info = getInstance();
+        out << "cmake info: \n";
+        out << " source_dir: " << info.getSourceDir() << '\n';
+        out << " cmake_osx_architectures: " << info.getCmakeOsxArchitectures() << '\n';
+        out << " cmake_host_system_processor: " << info.getCmakeHostSystemProcessor() << '\n';
+        out << " cmake_system_processor: " << info.getCmakeSystemProcessor() << '\n';
+        out << " cmake_host_system_name: " << info.getCmakeHostSystemName() << '\n';
+        out << " cmake_system_name: " << info.getCmakeSystemName() << '\n';
+        out << " cmake_c_compiler: " << info.getCmakeCCompiler() << '\n';
+        out << " cmake_cxx_compiler: " << info.getCmakeCxxCompiler() << '\n';
+        out << " cmake_cxx_compiler_id: " << info.getCmakeCxxCompilerId() << '\n';
+        out << " cmake_cxx_compiler_version: " << info.getCmakeCxxCompilerVersion() << '\n';
+        out << " cmake_crosscompiling: " << info.getCmakeCrosscompiling() << '\n';
+        out << " 
cmake_cxx_flags_debug: " << info.getCmakeCxxFlagsDebug() << '\n'; + out << " cmake_cxx_flags_release: " << info.getCmakeCxxFlagsRelease() << '\n'; + out << " cmake_build_type: " << info.getCmakeBuildType() << '\n'; + out << " cmake_toolchain_file: " << info.getCmakeToolchainFile() << '\n'; + out << "target info: \n"; + out << " target_name: " << info.getTargetName() << '\n'; + out << " target_compile_options: " << info.getTargetCompileOptions() << '\n'; + return out.str(); + } + +private: + CmakeInfo() = default; // + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CmakeInfo); // +}; +} // namespace benchmark +#endif diff --git a/benchmarks/analyze_better_blocks/alp_pub/results/i4i/README.md b/benchmarks/analyze_better_blocks/alp_pub/results/i4i/README.md new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/alp_pub/results/i4i/ped.csv b/benchmarks/analyze_better_blocks/alp_pub/results/i4i/ped.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/alp_pub/results/i4i/ped.metadata b/benchmarks/analyze_better_blocks/alp_pub/results/i4i/ped.metadata new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/alp_pub/results/president/ped.csv b/benchmarks/analyze_better_blocks/alp_pub/results/president/ped.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/benchmark/local.cmake b/benchmarks/analyze_better_blocks/benchmark/local.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/benchmark/placeholder.cpp b/benchmarks/analyze_better_blocks/benchmark/placeholder.cpp new file mode 100644 index 0000000..2ef72c1 --- /dev/null +++ b/benchmarks/analyze_better_blocks/benchmark/placeholder.cpp @@ -0,0 +1,10 @@ +#include +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +using namespace std; +int main(int argc, char **argv) { + + cout << "Hello BtrBlocks" << endl; + return 0; +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/CMakeLists.txt b/benchmarks/analyze_better_blocks/cengine/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/cengine/analysis/Analysis.cpp b/benchmarks/analyze_better_blocks/cengine/analysis/Analysis.cpp new file mode 100644 index 0000000..d5e2ef6 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/analysis/Analysis.cpp @@ -0,0 +1,202 @@ +#include "Units.hpp" +#include "MMapvector.hpp" +#include "Exceptions.hpp" +#include "parser/Parser.hpp" +#include "datablock/Datablock.hpp" +#include "datablock/CMachine.hpp" +#include "datablock/schemes/CSchemePool.hpp" +#include "analysis/Analysis.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "yaml-cpp/yaml.h" +#include "spdlog/spdlog.h" +#include "spdlog/sinks/rotating_file_sink.h" +#include "tbb/parallel_for.h" +#include "tbb/task_scheduler_init.h" +// ------------------------------------------------------------------------------------- +#include +#include +// ------------------------------------------------------------------------------------- +DEFINE_uint32(block_print_length, 20, "."); +DEFINE_uint32(block_count, 3, "."); +DEFINE_uint32(block_length, 65000, "."); +// 
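+// The three flags above drive the sampling in analyzeRelation() below:
+// block_count random blocks of up to block_length tuples are analyzed per
+// column, and block_print_length controls the printed block preview. A
+// hypothetical invocation overriding them (binary name is an assumption):
+//
+//   ./analyze --block_count=10 --block_length=65000 --block_print_length=40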
------------------------------------------------------------------------------------- +namespace cengine { +vector> analyzeRelation(Relation &relation) +{ + std::mutex sync_mutex; + std::vector> relation_analysis(relation.columns.size()); + tbb::parallel_for(SIZE(0), relation.columns.size(), [&](SIZE col_i) { + auto &column = relation.columns[col_i]; + auto tuple_count = relation.tuple_count; + auto &bitmap = column.bitmap; + map column_analysis; + column_analysis["c_tuple_count"] = to_string(relation.tuple_count); + column_analysis["c_size"] = to_string(column.sizeInBytes()); + // ------------------------------------------------------------------------------------- + map all_tuples; + switch ( column.type ) { + case ColumnType::INTEGER : { + all_tuples = analyzeBlock(column.integers(), bitmap, 0, tuple_count, true); + break; + } + case ColumnType::DOUBLE: { + all_tuples = analyzeBlock(column.doubles(), bitmap, 0, tuple_count, true); + break; + } + case ColumnType::STRING: { + all_tuples = analyzeStrBlock(column.strings(), bitmap, 0, tuple_count, true); + break; + } + default : { + UNREACHABLE(); + } + } + for ( const auto &element: all_tuples ) { + column_analysis["c_" + element.first] = element.second; + } + // ------------------------------------------------------------------------------------- + for ( u32 block_i = 1; block_i <= FLAGS_block_count; block_i++ ) { + u32 start_index = rand() % tuple_count; + u32 block_length = std::min(FLAGS_block_length, static_cast(tuple_count - start_index)); + map block_tuples; + switch ( column.type ) { + case ColumnType::INTEGER : { + block_tuples = analyzeBlock(column.integers(), bitmap, start_index, block_length, true); + break; + } + case ColumnType::DOUBLE: { + block_tuples = analyzeBlock(column.doubles(), bitmap, start_index, block_length, true); + break; + } + case ColumnType::STRING: { + block_tuples = analyzeStrBlock(column.strings(), bitmap, start_index, block_length, true); + break; + } + default : { + UNREACHABLE(); + } + } + for ( const auto &element: block_tuples ) { + column_analysis["b_" + to_string(block_i) + "_" + element.first] = element.second; + } + } + // ------------------------------------------------------------------------------------- + { + std::lock_guard lock(sync_mutex); + relation_analysis[col_i] = column_analysis; + } + }); + return relation_analysis; +} +// ------------------------------------------------------------------------------------- +map analyzeStrBlock(const Vector &column, Vector &bitmap, u32 start_index, u32 tuple_count, bool print_block) +{ + // CAN NOT simply substr strings, because we have multibyte chars (:@) + map stats; + stats["random_element"] = "-"; + bool is_starting_values_initialized = false; + u32 min_length = column[start_index].length(), max_length = column[start_index].length(); + u32 null_count = 0; + u32 zero_count = 0; + std::unordered_map frequency; + u64 sum_length = 0; + + for ( u32 tuple_i = start_index; tuple_i < start_index + tuple_count; tuple_i++ ) { + BITMAP is_set = bitmap.data[tuple_i]; + + if ( !is_set ) { + null_count++; + continue; + } + + auto current_value = column[tuple_i]; + if ( current_value.size() == 0 ) { + zero_count++; + } + + if ( frequency.find(current_value) == frequency.end()) { + frequency.insert({current_value, 1}); + } else { + frequency[current_value] = frequency[current_value] + 1; + } + + if ( is_starting_values_initialized ) { + if ( current_value.length() > max_length ) + max_length = current_value.length(); + if ( current_value.length() < min_length ) 
+                min_length = current_value.length();
+        } else {
+            is_starting_values_initialized = true;
+            min_length = max_length = current_value.length();
+            stats["random_element"] = current_value;
+        }
+
+        sum_length += current_value.length();
+    }
+    const u32 set_count = tuple_count - null_count;
+    const u32 unique_count = frequency.size();
+    {
+        using Comparator = function<bool(pair<str, u32>, pair<str, u32>)>;
+        // Lambda that orders two pairs by their second field (the frequency).
+        Comparator compFunctor =
+            [](pair<str, u32> elem1, pair<str, u32> elem2) {
+                return elem1.second > elem2.second;
+            };
+        // Set that stores the pairs sorted by the comparison logic above.
+        set<pair<str, u32>, Comparator> frequency_set(frequency.begin(), frequency.end(), compFunctor);
+        u32 top_i = 1;
+        for ( const auto &element: frequency_set ) {
+            str value = element.first;
+            double frequency = static_cast<double>(element.second) * 100.0 / static_cast<double>(set_count);
+            string key_prefix = "top_" + to_string(top_i);
+            string value_key = key_prefix + "_value";
+            string percent_key = key_prefix + "_percent";
+
+            stats[value_key] = value;
+            stats[percent_key] = to_string(frequency);
+            if ( top_i++ == ((unique_count >= 3) ? 3 : unique_count)) {
+                break;
+            }
+        }
+        for ( ; top_i <= 3; top_i++ ) {
+            string key_prefix = "top_" + to_string(top_i);
+            string value_key = key_prefix + "_value";
+            string percent_key = key_prefix + "_percent";
+
+            stats[value_key] = "";
+            stats[percent_key] = "";
+        }
+    }
+    // Use floating-point division; integer division would truncate the average.
+    float average_length = static_cast<float>(sum_length) / static_cast<float>(tuple_count);
+
+    if ( is_starting_values_initialized ) {
+        stats["min"] = to_string(min_length);
+        stats["max"] = to_string(max_length);
+    } else {
+        stats["min"] = "-";
+        stats["max"] = "-";
+    }
+    stats["null_count"] = to_string(null_count);
+    stats["zero_count"] = to_string(zero_count);
+    stats["unique_count"] = to_string(unique_count);
+    stats["average_length"] = to_string(average_length);
+    // -------------------------------------------------------------------------------------
+    if ( print_block ) {
+        string block_rep = "";
+        for ( u32 tuple_i = start_index + 1; tuple_i < start_index + FLAGS_block_print_length; tuple_i++ ) {
+            BITMAP is_set = bitmap.data[tuple_i];
+            if ( !is_set ) {
+                block_rep += "N";
+            } else if ( column[tuple_i] == column[tuple_i - 1] ) {
+                block_rep += ".";
+            } else {
+                block_rep += "x";
+            }
+        }
+        stats["block"] = block_rep;
+    }
+    // -------------------------------------------------------------------------------------
+    return stats;
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/analysis/Analysis.hpp b/benchmarks/analyze_better_blocks/cengine/analysis/Analysis.hpp
new file mode 100644
index 0000000..bc6d022
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/analysis/Analysis.hpp
@@ -0,0 +1,133 @@
+#include "Units.hpp"
+#include "MMapvector.hpp"
+#include "storage/Relation.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+// -------------------------------------------------------------------------------------
+#include
+#include
+#include
+#include
+// -------------------------------------------------------------------------------------
+using namespace std;
+
+// -------------------------------------------------------------------------------------
+DECLARE_uint32(block_print_length);
+DECLARE_uint32(block_count);
+DECLARE_uint32(block_length);
+// 
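+// Note: the "block" stat produced by analyzeBlock/analyzeStrBlock encodes a
+// sampled run one character per tuple: 'N' = null, '.' = same as the previous
+// value, 'x' = a new value. For example, the column [5, 5, 9, 9, NULL] yields
+// ".x.N" for the four comparisons after the first tuple.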
------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +template +map analyzeBlock(const Vector &column, Vector &bitmap, u32 start_index, u32 tuple_count, bool print_block = false) +{ + map stats; + stats["random_element"] = "-"; + T min, max; + bool is_starting_values_initialized = false; + u32 null_count = 0; + u32 zero_count = 0; + std::unordered_map frequency; + + for ( u32 tuple_i = start_index; tuple_i < start_index + tuple_count; tuple_i++ ) { + BITMAP is_set = bitmap.data[tuple_i]; + + if ( !is_set ) { + null_count++; + continue; + } + + auto current_value = column[tuple_i]; + if ( current_value == 0 ) { + zero_count++; + } + + if ( frequency.find(current_value) == frequency.end()) { + frequency.insert({current_value, 1}); + } else { + frequency[current_value] = frequency[current_value] + 1; + } + + if ( is_starting_values_initialized ) { + if ( current_value > max ) + max = current_value; + if ( current_value < min ) + min = current_value; + } else { + is_starting_values_initialized = true; + min = max = current_value; + stats["random_element"] = to_string(column[start_index]); + } + + } + const u32 set_count = tuple_count - null_count; + const u32 unique_count = frequency.size(); + { + using Comparator = std::function, pair)>; + // Defining a lambda function to compare two pairs. It will compare two pairs using second field + Comparator compFunctor = + [](pair elem1, pair elem2) { + return elem1.second > elem2.second; + }; + // Declaring a set that will store the pairs using above comparision logic + set, Comparator> frequency_set(frequency.begin(), frequency.end(), compFunctor); + u32 top_i = 1; + for ( const auto &element: frequency_set ) { + T value = element.first; + double frequency = static_cast(element.second) * 100.0 / static_cast(set_count); + string key_prefix = "top_" + to_string(top_i); + string value_key = key_prefix + "_value"; + string percent_key = key_prefix + "_percent"; + + stats[value_key] = to_string(value); + stats[percent_key] = to_string(frequency); + if ( top_i++ == ((unique_count >= 3) ? 
3 : unique_count)) { + break; + } + } + for ( ; top_i <= 3; top_i++ ) { + string key_prefix = "top_" + to_string(top_i); + string value_key = key_prefix + "_value"; + string percent_key = key_prefix + "_percent"; + + stats[value_key] = ""; + stats[percent_key] = "0"; + } + } + + if ( is_starting_values_initialized ) { + stats["min"] = to_string(min); + stats["max"] = to_string(max); + } else { + stats["min"] = "-"; + stats["max"] = "-"; + } + stats["null_count"] = to_string(null_count); + stats["zero_count"] = to_string(zero_count); + stats["unique_count"] = to_string(unique_count); + stats["average_length"] = "-"; + // ------------------------------------------------------------------------------------- + if ( print_block ) { + string block_rep = ""; + for ( u32 tuple_i = start_index + 1; tuple_i < start_index + FLAGS_block_print_length; tuple_i++ ) { + BITMAP is_set = bitmap.data[tuple_i]; + if ( !is_set ) { + block_rep += "N"; + } else if ( column[tuple_i] == column[tuple_i - 1] ) { + block_rep += "."; + } else { + block_rep += "x"; + } + } + stats["block"] = block_rep; + } + // ------------------------------------------------------------------------------------- + return stats; +} +// ------------------------------------------------------------------------------------- +map analyzeStrBlock(const Vector &column, Vector &bitmap, u32 start_index, u32 tuple_count, bool print_block = false); +// ------------------------------------------------------------------------------------- +vector> analyzeRelation(Relation &relation); +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/cengine/analysis/StringStats.cpp b/benchmarks/analyze_better_blocks/cengine/analysis/StringStats.cpp new file mode 100644 index 0000000..d64ebcc --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/analysis/StringStats.cpp @@ -0,0 +1,205 @@ +//#include "Units.hpp" +//#include "MMapvector.hpp" +//// ------------------------------------------------------------------------------------- +//#include "gflags/gflags.h" +//// ------------------------------------------------------------------------------------- +//#include +//#include +//#include +//#include +//// ------------------------------------------------------------------------------------- +//DEFINE_bool(print_header, false, "."); +//DEFINE_string(in, "", "."); +//DEFINE_string(out_csv, "", "."); +//DEFINE_string(delimiter, "\t", "."); +//DEFINE_uint32(max_char_per_cell, 100, "."); +//DECLARE_uint32(block_print_length); +//DECLARE_uint32(block_count); +//DECLARE_uint32(block_length); +//using namespace std; +//// ------------------------------------------------------------------------------------- +//// ------------------------------------------------------------------------------------- +//map analyzeBlock(Vector &column, Vector &bitmap, u32 start_index, u32 tuple_count, bool print_block = false) +//{ +// map stats; +// stats["random_element"] = "-"; +// bool is_starting_values_initialized = false; +// u32 min_length = column[start_index].length(), max_length = column[start_index].length(); +// u32 null_count = 0; +// u32 zero_count = 0; +// std::unordered_map frequency; +// u64 sum_length = 0; +// +// for ( u32 tuple_i = start_index; tuple_i < start_index + tuple_count; tuple_i++ ) { +// BITMAP is_set = bitmap.data[tuple_i]; +// +// if ( !is_set ) { +// null_count++; +// continue; +// } +// +// auto current_value = column[tuple_i]; +// if ( current_value.size() == 0 ) { +// zero_count++; +// } +// +// if ( 
frequency.find(current_value) == frequency.end()) { +// frequency.insert({current_value, 1}); +// } else { +// frequency[current_value] = frequency[current_value] + 1; +// } +// +// if ( is_starting_values_initialized ) { +// if ( current_value.length() > max_length ) +// max_length = current_value.length(); +// if ( current_value.length() < min_length ) +// min_length = current_value.length(); +// } else { +// is_starting_values_initialized = true; +// min_length = max_length = current_value.length(); +// stats["random_element"] = current_value; +// } +// +// sum_length += current_value.length(); +// } +// const u32 set_count = tuple_count - null_count; +// const u32 unique_count = frequency.size(); +// { +// using Comparator = function, pair)>; +// // Defining a lambda function to compare two pairs. It will compare two pairs using second field +// Comparator compFunctor = +// [](pair elem1, pair elem2) { +// return elem1.second > elem2.second; +// }; +// // Declaring a set that will store the pairs using above comparision logic +// set, Comparator> frequency_set(frequency.begin(), frequency.end(), compFunctor); +// u32 top_i = 1; +// for ( const auto &element: frequency_set ) { +// str value = element.first; +// double frequency = static_cast(element.second) * 100.0 / static_cast(set_count); +// string key_prefix = "top_" + to_string(top_i); +// string value_key = key_prefix + "_value"; +// string percent_key = key_prefix + "_percent"; +// +// stats[value_key] = value; +// stats[percent_key] = to_string(frequency); +// if ( top_i++ == ((unique_count >= 3) ? 3 : unique_count)) { +// break; +// } +// } +// for ( ; top_i <= 3; top_i++ ) { +// string key_prefix = "top_" + to_string(top_i); +// string value_key = key_prefix + "_value"; +// string percent_key = key_prefix + "_percent"; +// +// stats[value_key] = ""; +// stats[percent_key] = ""; +// } +// } +// float average_length = sum_length / tuple_count; +// +// if(is_starting_values_initialized) { +// stats["min"] = to_string(min_length); +// stats["max"] = to_string(max_length); +// } else { +// stats["min"] = "-"; +// stats["max"] = "-"; +// } +// stats["null_count"] = to_string(null_count); +// stats["zero_count"] = to_string(zero_count); +// stats["unique_count"] = to_string(unique_count); +// stats["average_length"] = to_string(average_length); +// // ------------------------------------------------------------------------------------- +// if ( print_block ) { +// string block_rep = ""; +// for ( u32 tuple_i = start_index + 1; tuple_i < start_index + FLAGS_block_print_length; tuple_i++ ) { +// BITMAP is_set = bitmap.data[tuple_i]; +// if ( !is_set ) { +// block_rep += "N"; +// } else if ( column[tuple_i] == column[tuple_i - 1] ) { +// block_rep += "."; +// } else { +// block_rep += "x"; +// } +// } +// stats["block"] = block_rep; +// } +// // ------------------------------------------------------------------------------------- +// return stats; +//} +//int main(int argc, char **argv) +//{ +// srand(time(NULL)); +// // ------------------------------------------------------------------------------------- +// gflags::SetUsageMessage("CSV Dataset parser"); +// gflags::ParseCommandLineFlags(&argc, &argv, true); +// // ------------------------------------------------------------------------------------- +// assert(FLAGS_out_csv.size()); +// string data_file = FLAGS_in, bitmap_file; +// { +// std::regex re("(.*).string"); +// std::smatch match; +// if ( std::regex_search(data_file, match, re) && match.size() > 1 ) { +// bitmap_file = 
match.str(1) + ".bitmap"; +// } +// } +// // ------------------------------------------------------------------------------------- +// Vector column; +// column.readBinary(data_file.c_str()); +// Vector bitmap; +// bitmap.readBinary(bitmap_file.c_str()); +// auto tuple_count = bitmap.size(); +// assert(bitmap.size() == column.size()); +// // ------------------------------------------------------------------------------------- +// map stats; +// { +// std::regex re("(\\/[^\\/]+\\/[^\\/]+).string"); +// std::smatch match; +// if ( std::regex_search(data_file, match, re) && match.size() > 1 ) { +// stats["col_id"] = match.str(1); +// } +// } +// // ------------------------------------------------------------------------------------- +// auto whole_column = analyzeBlock(column, bitmap, 0, tuple_count); +// for ( const auto &element: whole_column ) { +// stats["col_" + element.first] = element.second; +// } +// // ------------------------------------------------------------------------------------- +// for ( u32 block_i = 1; block_i <= FLAGS_block_count; block_i++ ) { +// u32 start_index = rand() % tuple_count; +// u32 block_length = std::min(FLAGS_block_length, static_cast(tuple_count - start_index)); +// auto block = analyzeBlock(column, bitmap, start_index, block_length, true); +// for ( const auto &element: block ) { +// stats["block_" + to_string(block_i) + "_" + element.first] = element.second; +// } +// } +// +// std::ofstream csv; +// csv.open(FLAGS_out_csv, std::ofstream::out | std::ofstream::app); +// assert(csv.good()); +// if ( csv.tellp() == 0 ) { +// for ( auto it = stats.begin(); it != stats.end(); ) { +// csv << it->first; +// if ( ++it != stats.end()) { +// csv << FLAGS_delimiter; +// } +// } +// csv << endl; +// } +// for ( auto it = stats.begin(); it != stats.end(); ) { +// auto sub_str = it->second.substr(0, FLAGS_max_char_per_cell); +// std::regex tabs_regex("\\t"); +// std::regex nl_regex("\\n"); +// auto sterilized_value = std::regex_replace(sub_str, tabs_regex, " "); +// sterilized_value = std::regex_replace(sterilized_value, nl_regex, " "); +// +// csv << sterilized_value; +// if ( ++it != stats.end()) { +// csv << FLAGS_delimiter; +// } +// } +// csv << endl; +// // ------------------------------------------------------------------------------------- +// return 0; +//} +//// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.cpp new file mode 100644 index 0000000..233a00a --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.cpp @@ -0,0 +1,212 @@ +// +// Created by david on 18.04.22. 
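+//
+// Rough usage sketch (hypothetical caller; the element type of the output
+// buffer and the mmap'ed input pointer are assumptions):
+//
+//   BtrReader reader(mapped_column_part);
+//   std::vector<u8> buffer;
+//   for (u32 chunk_i = 0; chunk_i < reader.getChunkCount(); chunk_i++) {
+//     bool requires_copy = reader.readColumn(buffer, chunk_i);
+//     // buffer now holds the decompressed values of chunk chunk_i
+//   }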
+// + +#include "BtrReader.hpp" + +#include +#include +#include +#include "spdlog/spdlog.h" +#include "datablock/schemes/v2/bitmap/RoaringBitmap.hpp" +#include "datablock/schemes/CSchemePicker.hpp" + +#define die_if(expr) if (!(expr)) { perror(#expr); assert(false); } + +namespace cengine::db { + +BtrReader::BtrReader(void *data) : data(data) { + this->m_bitmap_wrappers = std::vector(this->getChunkCount(), nullptr); + this->m_bitsets = std::vector *>(this->getChunkCount(), nullptr); +} + +BtrReader::~BtrReader() { + // bitsets are deleted by BitmapWrapper deconstructor + // (unless we release the bitmap first in which case the wrapper is deleted, but the bitset may still exist) + for (std::size_t i = 0; i < this->m_bitmap_wrappers.size(); i++) { + BitmapWrapper *wrapper = this->m_bitmap_wrappers[i]; + if (wrapper == nullptr) { + delete this->m_bitsets[i]; + } else { + delete wrapper; + } + } +} + +bool BtrReader::readColumn(std::vector &output_chunk_v, u32 index) { + // Fetch metadata for column + auto meta = this->getChunkMetadata(index); + + // Get a pointer to the beginning of the memory area with the data + auto input_data = static_cast(meta->data); + + // Decompress bitmap + u32 tuple_count = meta->tuple_count; + BitmapWrapper *bitmap = this->getBitmap(index); + + auto output_chunk = get_data(output_chunk_v, this->getDecompressedSize(index) + SIMD_EXTRA_BYTES); + bool requires_copy = false; + // Decompress data + switch (meta->type) { + case ColumnType::INTEGER: { + // Prepare destination array + auto destination_array = reinterpret_cast(output_chunk); + + // Fetch the scheme from metadata + auto &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(meta->compression_type); + scheme.decompress(destination_array, bitmap, input_data, tuple_count, 0); + break; + } + case ColumnType::DOUBLE: { + // Prepare destination array + auto destination_array = reinterpret_cast(output_chunk); + + auto &scheme = DoubleSchemePicker::MyTypeWrapper::getScheme(meta->compression_type); + scheme.decompress(destination_array, bitmap, input_data, tuple_count, 0); + break; + } + case ColumnType::STRING: { + auto &scheme = StringSchemePicker::MyTypeWrapper::getScheme(meta->compression_type); + requires_copy = scheme.decompressNoCopy(output_chunk, bitmap, input_data, tuple_count, 0); + break; + } + default: { + throw Generic_Exception("Type " + ConvertTypeToString(meta->type) + " not supported"); + } + } + + return requires_copy; +} + +string BtrReader::getSchemeDescription(u32 index) { + auto meta = this->getChunkMetadata(index); + u8 compression = meta->compression_type; + auto src = static_cast(meta->data); + + switch (meta->type) { + case ColumnType::INTEGER: { + auto &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(compression); + return scheme.fullDescription(src); + } + case ColumnType::DOUBLE: { + auto &scheme = DoubleSchemePicker::MyTypeWrapper::getScheme(compression); + return scheme.fullDescription(src); + } + case ColumnType::STRING: { + auto &scheme = StringSchemePicker::MyTypeWrapper::getScheme(compression); + return scheme.fullDescription(src); + } + default: throw Generic_Exception("Type " + ConvertTypeToString(meta->type) + " not supported"); + } +} + +string BtrReader::getBasicSchemeDescription(u32 index) { + // Only print the first level of the scheme description instead of all of them + auto meta = this->getChunkMetadata(index); + u8 compression = meta->compression_type; + auto src = static_cast(meta->data); + + switch (meta->type) { + case ColumnType::INTEGER: { + auto &scheme = 
IntegerSchemePicker::MyTypeWrapper::getScheme(compression); + return scheme.selfDescription(); + } + case ColumnType::DOUBLE: { + auto &scheme = DoubleSchemePicker::MyTypeWrapper::getScheme(compression); + return scheme.selfDescription(); + } + case ColumnType::STRING: { + auto &scheme = StringSchemePicker::MyTypeWrapper::getScheme(compression); + return scheme.selfDescription(src); + } + default: throw Generic_Exception("Type " + ConvertTypeToString(meta->type) + " not supported"); + } +} + +// TODO make the bitset thread local +BitmapWrapper *BtrReader::getBitmap(u32 index) { + if (this->m_bitmap_wrappers[index] != nullptr) { + return this->m_bitmap_wrappers[index]; + } + + auto meta = this->getChunkMetadata(index); + auto type = meta->nullmap_type; + // Allocate bitset if it's not yet there + if (this->m_bitsets[index] == nullptr && type != BitmapType::ALLONES && type != BitmapType::ALLZEROS) { + this->m_bitsets[index] = new boost::dynamic_bitset<>(meta->tuple_count); + } + // TODO if there are too many page fault try to do the allocation of the bitset inside the object beforehand + this->m_bitmap_wrappers[index] = new BitmapWrapper( + meta->data + meta->nullmap_offset, + type, + meta->tuple_count, + this->m_bitsets[index] + ); + return this->m_bitmap_wrappers[index]; +} + +void BtrReader::releaseBitmap(u32 index) { + if (this->m_bitmap_wrappers[index] == nullptr) { + return; + } + this->m_bitmap_wrappers[index]->releaseBitset(); + delete this->m_bitmap_wrappers[index]; + this->m_bitmap_wrappers[index] = nullptr; +} + + +BitmapWrapper * BtrReader::releaseBitmapOwnership(u32 index) { + BitmapWrapper *ret = this->m_bitmap_wrappers[index]; + this->m_bitmap_wrappers[index] = nullptr; + this->m_bitsets[index] = nullptr; + return ret; +} + +u32 BtrReader::getDecompressedSize(u32 index) { + auto meta = this->getChunkMetadata(index); + + switch (meta->type) { + case ColumnType::INTEGER: { + return sizeof(INTEGER) * meta->tuple_count; + } + case ColumnType::DOUBLE: { + return sizeof(DOUBLE) * meta->tuple_count; + } + case ColumnType::STRING: { + auto &scheme = StringSchemePicker::MyTypeWrapper::getScheme(meta->compression_type); + + auto input_data = static_cast(meta->data); + BitmapWrapper *bitmapWrapper = this->getBitmap(index); + u32 size = scheme.getDecompressedSizeNoCopy(input_data, meta->tuple_count, bitmapWrapper); + // TODO The 4096 is temporary until I figure out why FSST is returning bigger numbers + return size + 8 + 4096; // +8 because of fsst decompression + } + default: { + throw Generic_Exception("Type " + ConvertTypeToString(this->getColumnType()) + " not supported"); + } + } +} + +u32 BtrReader::getDecompressedDataSize(u32 index) { + auto meta = this->getChunkMetadata(index); + switch (meta->type) { + case ColumnType::INTEGER: { + return sizeof(INTEGER) * meta->tuple_count; + } + case ColumnType::DOUBLE: { + return sizeof(DOUBLE) * meta->tuple_count; + } + case ColumnType::STRING: { + auto &scheme = StringSchemePicker::MyTypeWrapper::getScheme(meta->compression_type); + + auto input_data = static_cast(meta->data); + BitmapWrapper *bitmapWrapper = this->getBitmap(index); + u32 size = scheme.getTotalLength(input_data, meta->tuple_count, bitmapWrapper); + return size; + } + default: { + throw Generic_Exception("Type " + ConvertTypeToString(meta->type) + " not supported"); + } + } +} +} diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.hpp new file mode 100644 index 0000000..99bde1f --- 
/dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/BtrReader.hpp @@ -0,0 +1,49 @@ +// +// Created by david on 18.04.22. +// + +#pragma once + +#include "datablock/Datablock.hpp" +#include + +namespace cengine::db { + +class BtrReader { +public: + explicit BtrReader(void *data); + virtual ~BtrReader(); + bool readColumn(std::vector &output_chunk, u32 index); + [[nodiscard]] string getSchemeDescription(u32 index); + [[nodiscard]] string getBasicSchemeDescription(u32 index); + + [[nodiscard]] u32 getDecompressedSize(u32 index); + // Only the whole decompressed data without the btr specific metadata around it. + [[nodiscard]] u32 getDecompressedDataSize(u32 index); + [[nodiscard]] BitmapWrapper *getBitmap(u32 index); + void releaseBitmap(u32 index); + BitmapWrapper * releaseBitmapOwnership(u32 index); + + [[nodiscard]] inline const ColumnPartMetadata *getPartMetadata() { + return reinterpret_cast(this->data); + } + [[nodiscard]] inline const ColumnChunkMeta *getChunkMetadata(u32 index) { + u32 offset = this->getPartMetadata()->offsets[index]; + return reinterpret_cast(reinterpret_cast(this->data) + offset); + } + [[nodiscard]] inline u32 getTupleCount(u32 index) { + return this->getChunkMetadata(index)->tuple_count; + } + [[nodiscard]] inline ColumnType getColumnType() { + return this->getChunkMetadata(0)->type; + } + [[nodiscard]] inline u32 getChunkCount() { + return this->getPartMetadata()->num_chunks; + } +private: + void *data{}; + std::vector m_bitmap_wrappers; + std::vector *> m_bitsets; +}; + +} diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/CMachine.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/CMachine.hpp new file mode 100644 index 0000000..08ed947 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/CMachine.hpp @@ -0,0 +1,32 @@ +#pragma once +#include "Units.hpp" +#include "storage/Chunk.hpp" +#include "storage/Relation.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +struct OutputBlockStats { + vector data_sizes; + vector nullmap_sizes; + vector used_compression_schemes; + // ------------------------------------------------------------------------------------- + // Aux + SIZE total_data_size; + SIZE total_nullmap_size; + SIZE total_db_size; + double compression_ratio; //before /old data size +}; +// ------------------------------------------------------------------------------------- +class CMachine { +protected: + const Relation &relation; +public: + CMachine(const Relation &relation) + : relation(relation) {} + + virtual OutputBlockStats compress(const Chunk &input_chunk, BytesArray &output_block) = 0; + virtual cengine::Chunk decompress(const BytesArray &input_block) = 0; +}; +} +// ------------------------------------------------------------------------------------- \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/Datablock.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/Datablock.cpp new file mode 100644 index 0000000..a8621ff --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/Datablock.cpp @@ -0,0 +1,405 @@ +#include "Datablock.hpp" +#include "schemes/CScheme.hpp" +#include "storage/Chunk.hpp" +#include "datablock/schemes/CSchemePool.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +#include 
"datablock/cache/ThreadCache.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v1/integer/OneValue.hpp" +#include "datablock/schemes/v1/integer/Truncation.hpp" +#include "datablock/schemes/v1/integer/Dictionary.hpp" +#include "datablock/schemes/v1/integer/Uncompressed.hpp" +#include "datablock/schemes/v1/double/OneValue.hpp" +#include "datablock/schemes/v1/double/Uncompressed.hpp" +#include "datablock/schemes/v1/string/Dictionary.hpp" +#include "datablock/schemes/v1/string/OneValue.hpp" +#include "datablock/schemes/v1/string/Uncompressed.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v2/string/DynamicDictionary.hpp" +#include "datablock/schemes/v2/bitmap/RoaringBitmap.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "spdlog/spdlog.h" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +DEFINE_uint32(doubles_max_cascading_level, 3, ""); +DEFINE_uint32(integers_max_cascading_level, 3, ""); +DEFINE_uint32(strings_max_cascading_level, 3, ""); +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +// ------------------------------------------------------------------------------------- +Datablock::Datablock(const Relation &relation) + : CMachine(relation) +{ +} +u32 Datablock::writeMetadata(const std::string &path, std::vector types, vector part_counters, + u32 num_chunks) { + std::ofstream metadata_file(path, std::ios::out | std::ios::binary); + if (!metadata_file.good()) { + throw Generic_Exception("Opening metadata output file failed"); + } + + u32 bytes_written = 0; + FileMetadata metadata { + .num_columns = static_cast(types.size()), + .num_chunks = static_cast(num_chunks) + }; + + metadata_file.write(reinterpret_cast(&metadata), sizeof(metadata)); + bytes_written += sizeof(metadata); + + for (u32 column = 0; column < metadata.num_columns; column++) { + ColumnPartInfo info { + .type = types[column], + .num_parts = static_cast(part_counters[column]) + }; + metadata_file.write(reinterpret_cast(&info), sizeof(info)); + bytes_written += sizeof(info); + } + + metadata_file.close(); + return bytes_written; +} +// ------------------------------------------------------------------------------------- +std::vector Datablock::compress(const InputChunk &input_chunk) { + // We do not now the exact output size. 
Therefore we allocate too much and then simply make the space smaller afterwards
+ const u32 size = sizeof(ColumnChunkMeta) + 10 * input_chunk.size + sizeof(BITMAP) * input_chunk.tuple_count;
+ std::vector<u8> output(size);
+ auto meta = reinterpret_cast<ColumnChunkMeta *>(output.data());
+ meta->tuple_count = input_chunk.tuple_count;
+ meta->type = input_chunk.type;
+
+ auto output_data = meta->data;
+
+ switch ( input_chunk.type ) {
+ case ColumnType::INTEGER: {
+ IntegerSchemePicker::compress(reinterpret_cast<const INTEGER *>(input_chunk.data.get()), input_chunk.nullmap.get(), output_data, input_chunk.tuple_count, FLAGS_integers_max_cascading_level, meta->nullmap_offset, meta->compression_type);
+ break;
+ }
+ case ColumnType::DOUBLE : {
+ // -------------------------------------------------------------------------------------
+ DoubleSchemePicker::compress(reinterpret_cast<const DOUBLE *>(input_chunk.data.get()), input_chunk.nullmap.get(), output_data, input_chunk.tuple_count, FLAGS_doubles_max_cascading_level, meta->nullmap_offset, meta->compression_type);
+ // -------------------------------------------------------------------------------------
+ break;
+ }
+ case ColumnType::STRING : {
+ // -------------------------------------------------------------------------------------
+ // Collect stats
+ StringStats stats = StringStats::generateStats(StringArrayViewer(input_chunk.data.get()), input_chunk.nullmap.get(), input_chunk.tuple_count, input_chunk.size);
+ // -------------------------------------------------------------------------------------
+ // Make decisions
+ StringScheme &preferred_scheme = StringSchemePicker::chooseScheme(stats, FLAGS_strings_max_cascading_level);
+ // Update meta data
+ meta->compression_type = static_cast<u8>(preferred_scheme.schemeType());
+ // -------------------------------------------------------------------------------------
+ // Compression
+ ThreadCache::get().compression_level++;
+ const StringArrayViewer str_viewer(input_chunk.data.get());
+ u32 after_column_size = preferred_scheme.compress(str_viewer, input_chunk.nullmap.get(), output_data, stats);
+ meta->nullmap_offset = after_column_size;
+ // -------------------------------------------------------------------------------------
+ for ( u8 i = 0; i < 5 - FLAGS_strings_max_cascading_level; i++ ) {
+ ThreadCache::get() << "\t";
+ }
+ ThreadCache::get() << "for : ? - scheme = " + ConvertSchemeTypeToString(preferred_scheme.schemeType())
+ +" before = " + std::to_string(stats.total_size)
+ + " after = " + std::to_string(after_column_size)
+ + " gain = " + std::to_string(CD(stats.total_size) / CD(after_column_size))
+ + '\n';
+
+ double estimated_cf = CD(stats.total_size) / CD(after_column_size); // we only have one scheme for strings (beside ONE_VALUE)
+ ThreadCache::dumpPush(ConvertSchemeTypeToString(preferred_scheme.schemeType()) + (ThreadCache::hasUsedFsst() ? "_FSST" : ""),
+ estimated_cf,
+ stats.total_size, after_column_size,
+ stats.unique_count, "?");
+ ThreadCache::get().compression_level--;
+ // -------------------------------------------------------------------------------------
+ break;
+ }
+ default:
+ throw Generic_Exception("Type not supported");
+ }
+
+ // Compress bitmap
+ auto [nullmap_size, bitmap_type] = v2::bitmap::RoaringBitmap::compress(input_chunk.nullmap.get(), output_data + meta->nullmap_offset, input_chunk.tuple_count);
+ meta->nullmap_type = bitmap_type;
+ u32 total_size = sizeof(*meta) + meta->nullmap_offset + nullmap_size;
+ // Resize the output vector to the actual used size
+ output.resize(total_size);
+
+ // Print decision tree
+ ThreadCache::get() << " type = " + ConvertTypeToString(input_chunk.type)
+ + " before = " + std::to_string(input_chunk.size)
+ + " after = " + std::to_string(total_size)
+ + " gain = " + std::to_string(CD(input_chunk.size) / CD(total_size))
+ + "\n\n\n";
+ return output;
+}
+
+bool Datablock::decompress(const u8 *data_in, BitmapWrapper **bitmap_out, u8 *data_out) {
+ // TODO this code is unused
+ auto meta = reinterpret_cast<const ColumnChunkMeta *>(data_in);
+
+ // Decompress bitmap
+ *bitmap_out = new BitmapWrapper(
+ meta->data + meta->nullmap_offset,
+ meta->nullmap_type,
+ meta->tuple_count
+ );
+
+ // Decompress data, we assume that data_out already has the correct size allocated
+ bool requires_copy_out;
+ switch ( meta->type ) {
+ case ColumnType::INTEGER : {
+ auto &scheme = CSchemePool::available_schemes->integer_schemes[static_cast<IntegerSchemeType>(meta->compression_type)];
+ scheme->decompress(reinterpret_cast<INTEGER *>(data_out), *bitmap_out,
+ meta->data, meta->tuple_count, 0);
+ requires_copy_out = false;
+ break;
+ }
+ case ColumnType::DOUBLE : {
+ auto &scheme = CSchemePool::available_schemes->double_schemes[static_cast<DoubleSchemeType>(meta->compression_type)];
+ scheme->decompress(reinterpret_cast<DOUBLE *>(data_out), *bitmap_out, meta->data, meta->tuple_count, 0);
+ requires_copy_out = false;
+ break;
+ }
+ case ColumnType::STRING : {
+ auto &scheme = CSchemePool::available_schemes->string_schemes[static_cast<StringSchemeType>(meta->compression_type)];
+ requires_copy_out = scheme->decompressNoCopy(data_out, *bitmap_out, meta->data, meta->tuple_count, 0);
+ break;
+ }
+ default:
+ throw Generic_Exception("Type not supported");
+ }
+
+ return requires_copy_out;
+}
+
+OutputBlockStats Datablock::compress(const Chunk &input_chunk, BytesArray &output_block)
+{
+ // -------------------------------------------------------------------------------------
+ const u32 db_meta_buffer_size = sizeof(DatablockMeta) + (relation.columns.size() * sizeof(ColumnMeta));
+ u32 input_chunk_total_data_size = 0;
+ // Reserve memory for output (datablock)
+ if ( !output_block ) {
+ for ( u32 column_i = 0; column_i < relation.columns.size(); column_i++ ) {
+ input_chunk_total_data_size += input_chunk.size(column_i);
+ }
+ const u32 output_block_size = (db_meta_buffer_size + input_chunk_total_data_size) * 10 + (relation.columns.size() * sizeof(BITMAP) * input_chunk.tuple_count);
+ output_block = makeBytesArray(output_block_size);
+ }
+ // -------------------------------------------------------------------------------------
+ assert(output_block);
+ // -------------------------------------------------------------------------------------
+ auto db_meta = reinterpret_cast<DatablockMeta *>(output_block.get());
+ db_meta->count = input_chunk.tuple_count;
+ db_meta->column_count = relation.columns.size();
+ // -------------------------------------------------------------------------------------
+ OutputBlockStats output_db_stats = {
+ .data_sizes = vector<u32>(relation.columns.size(), 0)
+ , .nullmap_sizes = vector<u32>(relation.columns.size(), 0)
+ , .used_compression_schemes = vector<u8>(relation.columns.size(), 255)
+ , .total_data_size = 0
+ , .total_nullmap_size = 0
+ , .total_db_size = db_meta_buffer_size}; // we are going to determine the size during columns analysis
+ u32 db_write_offset = db_meta_buffer_size;
+ // -------------------------------------------------------------------------------------
+ u32 after_column_size = 0;
+ for ( u32 column_i = 0; column_i < relation.columns.size(); column_i++ ) {
+ auto &column = relation.columns[column_i];
+ auto &column_meta = db_meta->attributes_meta[column_i];
+ column_meta.column_type = column.type;
+ // -------------------------------------------------------------------------------------
+ spdlog::info("DB: compressing column : {}", column.name);
+ ThreadCache::dumpSet(relation.name, column.name, ConvertTypeToString(column.type));
+ // -------------------------------------------------------------------------------------
+ switch ( column.type ) {
+ case ColumnType::INTEGER: {
+ // First: apply FOR to remove negative numbers and decrease the range
+ // -------------------------------------------------------------------------------------
+ const BITMAP *nullmap = input_chunk.nullmap(column_i);
+ // -------------------------------------------------------------------------------------
+ // Compression
+ IntegerSchemePicker::compress(input_chunk.array(column_i), nullmap, output_block.get() + db_write_offset, input_chunk.tuple_count, FLAGS_integers_max_cascading_level, after_column_size, column_meta.compression_type);
+ after_column_size += sizeof(column_meta.bias);
+ // -------------------------------------------------------------------------------------
+ break;
+ }
+ case ColumnType::DOUBLE : {
+ // -------------------------------------------------------------------------------------
+ DoubleSchemePicker::compress(input_chunk.array(column_i), input_chunk.nullmap(column_i), output_block.get() + db_write_offset, input_chunk.tuple_count, FLAGS_doubles_max_cascading_level, after_column_size, column_meta.compression_type);
+ // -------------------------------------------------------------------------------------
+ break;
+ }
+ case ColumnType::STRING : {
+ // -------------------------------------------------------------------------------------
+ // Collect stats
+ StringStats stats = StringStats::generateStats(StringArrayViewer(input_chunk.array(column_i)), input_chunk.nullmap(column_i), input_chunk.tuple_count, input_chunk.size(column_i));
+ // -------------------------------------------------------------------------------------
+ // Make decisions
+ StringScheme &preferred_scheme = StringSchemePicker::chooseScheme(stats, FLAGS_strings_max_cascading_level);
+ // Update meta data
+ column_meta.compression_type = static_cast<u8>(preferred_scheme.schemeType());
+ // -------------------------------------------------------------------------------------
+ // Compression
+ ThreadCache::get().compression_level++;
+ const StringArrayViewer str_viewer(input_chunk.array(column_i));
+ after_column_size = preferred_scheme.compress(str_viewer, input_chunk.nullmap(column_i), output_block.get() + db_write_offset, stats);
+ // -------------------------------------------------------------------------------------
+ for ( u8 i = 0; i < 5 - FLAGS_strings_max_cascading_level; i++ ) {
+ ThreadCache::get() << "\t";
+ }
+ ThreadCache::get() << "for : ? - scheme = " + ConvertSchemeTypeToString(preferred_scheme.schemeType())
+ +" before = " + std::to_string(stats.total_size)
+ + " after = " + std::to_string(after_column_size)
+ + " gain = " + std::to_string(CD(stats.total_size) / CD(after_column_size))
+ + '\n';
+
+ double estimated_cf = CD(stats.total_size) / CD(after_column_size); // we only have one scheme for strings (beside ONE_VALUE)
+ ThreadCache::dumpPush(ConvertSchemeTypeToString(preferred_scheme.schemeType()) + (ThreadCache::hasUsedFsst() ? "_FSST" : ""),
+ estimated_cf,
+ stats.total_size, after_column_size,
+ stats.unique_count, "?");
+ ThreadCache::get().compression_level--;
+ // -------------------------------------------------------------------------------------
+ break;
+ }
+ default:
+ throw Generic_Exception("Type not supported");
+ }
+ // -------------------------------------------------------------------------------------
+ // Update offsets
+ column_meta.offset = db_write_offset;
+ db_write_offset += after_column_size;
+ // -------------------------------------------------------------------------------------
+ // Compress bitmap
+ column_meta.nullmap_offset = db_write_offset;
+ auto [nullmap_size, bitmap_type] = v2::bitmap::RoaringBitmap::compress(input_chunk.nullmap(column_i), output_block.get() + db_write_offset, input_chunk.tuple_count);
+ column_meta.bitmap_type = bitmap_type;
+ db_write_offset += nullmap_size;
+ output_db_stats.nullmap_sizes[column_i] = nullmap_size;
+ // -------------------------------------------------------------------------------------
+ // Update output db stats
+ output_db_stats.used_compression_schemes[column_i] = column_meta.compression_type;
+ output_db_stats.data_sizes[column_i] = after_column_size;
+ // -------------------------------------------------------------------------------------
+ // Print decision tree
+ ThreadCache::get() << "name = " + column.name + " type = " + ConvertTypeToString(column.type)
+ + " before = " + std::to_string(input_chunk.size(column_i))
+ + " after = " + std::to_string(after_column_size)
+ + " gain = " + std::to_string(CD(input_chunk.size(column_i)) / CD(after_column_size))
+ + "\n\n\n";
+ }
+ // -------------------------------------------------------------------------------------
+ // We don't really have to calculate total size and compression ratio here :-S
+ for ( u32 column_i = 0; column_i < relation.columns.size(); column_i++ ) {
+ output_db_stats.total_data_size += output_db_stats.data_sizes[column_i];
+ output_db_stats.total_nullmap_size += output_db_stats.nullmap_sizes[column_i];
+ }
+ output_db_stats.total_db_size += output_db_stats.total_nullmap_size + output_db_stats.total_data_size;
+ assert(db_write_offset <= output_db_stats.total_db_size);
+ output_db_stats.compression_ratio = static_cast<double>(input_chunk_total_data_size) / static_cast<double>(output_db_stats.total_data_size);
+ // -------------------------------------------------------------------------------------
+ db_meta->size = output_db_stats.total_db_size;
+ // -------------------------------------------------------------------------------------
+ return output_db_stats;
+}
+// -------------------------------------------------------------------------------------
+cengine::Chunk Datablock::decompress(const BytesArray &input_db)
+{
+ // TODO this function should not rely on the presence of the relation
+ auto db_meta = reinterpret_cast<DatablockMeta *>(input_db.get());
+ const u32 tuple_count = db_meta->count;
+ // -------------------------------------------------------------------------------------
+ auto columns = std::unique_ptr<std::unique_ptr<u8[]>[]>(new std::unique_ptr<u8[]>[relation.columns.size()]);
+ std::unique_ptr<bool[]> column_requires_copy(new bool[relation.columns.size()]);
+ auto bitmaps = std::unique_ptr<std::unique_ptr<BITMAP[]>[]>(new std::unique_ptr<BITMAP[]>[relation.columns.size()]);
+ auto sizes = std::unique_ptr<size_t[]>(new size_t[relation.columns.size()]);
+ // -------------------------------------------------------------------------------------
+ for ( u32 column_i = 0; column_i < relation.columns.size(); column_i++ ) {
+ auto &column = relation.columns[column_i];
+ auto &column_meta = db_meta->attributes_meta[column_i];
+ bitmaps[column_i] = std::unique_ptr<BITMAP[]>(new BITMAP[tuple_count]);
+ // -------------------------------------------------------------------------------------
+ // Decompress bitmap if necessary
+ auto bitmap = BitmapWrapper(input_db.get() + column_meta.nullmap_offset, column_meta.bitmap_type, tuple_count);
+ bitmap.writeBITMAP(bitmaps[column_i].get());
+ // -------------------------------------------------------------------------------------
+ switch ( column.type ) {
+ case ColumnType::INTEGER : {
+ // -------------------------------------------------------------------------------------
+ sizes[column_i] = sizeof(INTEGER) * tuple_count;
+ columns[column_i] = makeBytesArray(sizeof(INTEGER) * tuple_count + SIMD_EXTRA_BYTES);
+ // -------------------------------------------------------------------------------------
+ auto destination_array = reinterpret_cast<INTEGER *>(columns[column_i].get());
+ auto &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(column_meta.compression_type);
+ // -------------------------------------------------------------------------------------
+ scheme.decompress(destination_array, &bitmap,
+ input_db.get() + column_meta.offset, tuple_count, 0);
+ column_requires_copy[column_i] = false;
+ break;
+ }
+ case ColumnType::DOUBLE : {
+ // -------------------------------------------------------------------------------------
+ sizes[column_i] = sizeof(DOUBLE) * tuple_count;
+ columns[column_i] = makeBytesArray(sizeof(DOUBLE) * tuple_count + SIMD_EXTRA_BYTES);
+ // -------------------------------------------------------------------------------------
+ auto column_dest_double_array = reinterpret_cast<DOUBLE *>(columns[column_i].get());
+ const auto used_compression_scheme = static_cast<DoubleSchemeType>(column_meta.compression_type);
+ auto &scheme = CSchemePool::available_schemes->double_schemes[used_compression_scheme];
+ // -------------------------------------------------------------------------------------
+ scheme->decompress(column_dest_double_array, &bitmap, input_db.get() + column_meta.offset, tuple_count, 0);
+ column_requires_copy[column_i] = false;
+ break;
+ }
+ case ColumnType::STRING : {
+ // -------------------------------------------------------------------------------------
+ const auto used_compression_scheme = static_cast<StringSchemeType>(column_meta.compression_type);
+ auto &scheme = CSchemePool::available_schemes->string_schemes[used_compression_scheme];
+ // -------------------------------------------------------------------------------------
+ sizes[column_i] = scheme->getDecompressedSizeNoCopy(
+ input_db.get() + column_meta.offset,
+ tuple_count,
+ &bitmap);
+ // TODO The 4096 is temporary until I figure out why FSST is returning bigger numbers
+ columns[column_i] = makeBytesArray(sizes[column_i] + 8 + SIMD_EXTRA_BYTES + 4096); // +8 extra bytes because of fsst decompression
+ // -------------------------------------------------------------------------------------
+ column_requires_copy[column_i] = scheme->decompressNoCopy(columns[column_i].get(), &bitmap, input_db.get() + column_meta.offset, tuple_count, 0);
+ break;
+ }
+ default:
+ throw Generic_Exception("Type not supported");
+ break;
+ }
+ }
+ // -------------------------------------------------------------------------------------
+ return Chunk(std::move(columns), std::move(bitmaps), std::move(column_requires_copy), tuple_count, relation, std::move(sizes));
+}
+// -------------------------------------------------------------------------------------
+void Datablock::getCompressedColumn(const BytesArray &input_db, u32 col_i, u8 *&ptr, u32 &size)
+{
+ auto db_meta = reinterpret_cast<DatablockMeta *>(input_db.get());
+ ptr = input_db.get() + db_meta->attributes_meta[col_i].offset;
+ if ( col_i == relation.columns.size() - 1 ) {
+ size = db_meta->size - db_meta->attributes_meta[col_i].offset;
+ } else {
+ size = db_meta->attributes_meta[col_i + 1].offset - db_meta->attributes_meta[col_i].offset;
+ }
+}
+// -------------------------------------------------------------------------------------
+void CSchemePool::refresh()
+{
+ cengine::db::CSchemePool::available_schemes = make_unique<SchemesCollection>();
+}
+// -------------------------------------------------------------------------------------
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/Datablock.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/Datablock.hpp
new file mode 100644
index 0000000..5e6c341
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/Datablock.hpp
@@ -0,0 +1,73 @@
+#pragma once
+#include "storage/Chunk.hpp"
+#include "datablock/CMachine.hpp"
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+#include <vector>
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+
+// Begin new chunking
+struct ColumnChunkMeta {
+ u8 compression_type;
+ BitmapType nullmap_type;
+ ColumnType type;
+ // There is 1 unused byte here.
+ u32 nullmap_offset = 0;
+ u32 tuple_count;
+ u8 data[];
+};
+static_assert(sizeof(ColumnChunkMeta) == 12);
+
+struct ColumnPartInfo {
+ ColumnType type;
+ // There are 3 unused bytes here.
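+ // (Layout note: the 8-byte size asserted below works out as 1 byte for the
+ // ColumnType enum, 3 bytes of alignment padding, then the 4-byte u32
+ // `num_parts`.)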
+ u32 num_parts;
+};
+static_assert(sizeof(ColumnPartInfo) == 8);
+
+struct FileMetadata {
+ u32 num_columns;
+ u32 num_chunks;
+ struct ColumnPartInfo parts[];
+};
+static_assert(sizeof(FileMetadata) == 8);
+// End new chunking
+struct __attribute__ ((packed)) ColumnMeta {
+ u8 compression_type;
+ ColumnType column_type;
+ BitmapType bitmap_type;
+ u8 padding;
+ u32 offset;
+ s32 bias;
+ u32 nullmap_offset = 0;
+};
+static_assert(sizeof(ColumnMeta) == 16, "");
+// -------------------------------------------------------------------------------------
+struct DatablockMeta {
+ u32 count;
+ u32 size;
+ u32 column_count;
+ u32 padding;
+ ColumnMeta attributes_meta[];
+};
+static_assert(sizeof(DatablockMeta) == 16, "");
+// -------------------------------------------------------------------------------------
+class Datablock : public CMachine {
+public:
+ Datablock(const Relation &relation);
+ virtual OutputBlockStats compress(const Chunk &input_chunk, BytesArray &output_block);
+ virtual cengine::Chunk decompress(const BytesArray &input_block);
+ virtual void getCompressedColumn(const BytesArray &input_db, u32 col_i, u8 *&ptr, u32 &size);
+
+ static bool decompress(const u8 *data_in, BitmapWrapper **bitmap_out, u8 *data_out);
+ static vector<u8> compress(const InputChunk &input_chunk);
+ static u32 writeMetadata(const std::string &path, std::vector<ColumnType> types, vector<u32> part_counters,
+ u32 num_chunks);
+};
+// -------------------------------------------------------------------------------------
+}
+}
+// -------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.cpp
new file mode 100644
index 0000000..62b14fc
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.cpp
@@ -0,0 +1,65 @@
+#include "ThreadCache.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+tbb::enumerable_thread_specific<ThreadCacheContainer> ThreadCache::data = {};
+ThreadCacheContainer &ThreadCache::get()
+{
+ return data.local();
+}
+// -------------------------------------------------------------------------------------
+bool ThreadCacheContainer::isOnHotPath()
+{
+ return estimation_level == 0;
+}
+// -------------------------------------------------------------------------------------
+void ThreadCache::dumpSet(string rel_name, string col_name, string col_type)
+{
+ get().dump_meta.rel_name = rel_name;
+ get().dump_meta.col_name = col_name;
+ get().dump_meta.col_type = col_type;
+}
+// -------------------------------------------------------------------------------------
+void ThreadCache::dumpPush(string scheme_name, double cf, u32 before_size, u32 after_size, u32 unique_count, string comment)
+{
+ get().estimation_deviation_csv << get().dump_meta.rel_name << '\t'
+ << get().dump_meta.col_name << '\t'
+ << get().dump_meta.col_type << '\t'
+ << get().dump_meta.chunk_i << '\t'
+ << get().compression_level << '\t'
+ << scheme_name << '\t'
+ << cf << '\t'
+ << before_size << '\t'
+ << after_size << '\t'
+ << CD(before_size) / CD(after_size) << '\t'
+ << comment << '\t'
+ << unique_count << '\n';
+}
+// -------------------------------------------------------------------------------------
+void ThreadCache::dumpFsst(u32 before_total, u32 before_pool, u32 after_pool, u32 after_total)
+{
+ get().fsst_csv << get().dump_meta.rel_name << '\t'
+ << get().dump_meta.col_name << '\t'
+ << get().dump_meta.chunk_i << '\t'
+ << before_total << '\t'
+ << before_pool << '\t'
+ << after_pool << '\t'
+ << after_total << '\n';
+}
+// -------------------------------------------------------------------------------------
+void ThreadCache::setFsst()
+{
+ if(get().isOnHotPath()) {
+ get().fsst = true;
+ }
+}
+// -------------------------------------------------------------------------------------
+ bool ThreadCache::hasUsedFsst()
+ {
+ bool fsst = get().fsst;
+ get().fsst = false;
+ return fsst;
+ }
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.hpp
new file mode 100644
index 0000000..dbe8fc9
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/cache/ThreadCache.hpp
@@ -0,0 +1,52 @@
+#pragma once
+#include "Units.hpp"
+// -------------------------------------------------------------------------------------
+#include "tbb/enumerable_thread_specific.h"
+// -------------------------------------------------------------------------------------
+#include <sstream>
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+struct ThreadCacheContainer {
+ struct DumpMeta {
+ // rel col_n col_t block_n level_n estimated_cf before after
+ string rel_name;
+ string col_name;
+ string col_type;
+ u32 chunk_i;
+ };
+ DumpMeta dump_meta;
+ std::stringstream estimation_deviation_csv; // decision_tree
+ std::stringstream fsst_csv; // fsst
+ // -------------------------------------------------------------------------------------
+ bool isOnHotPath();
+ std::stringstream log; // decision_tree
+ u16 estimation_level = 0;
+ u16 compression_level = 0;
+ // -------------------------------------------------------------------------------------
+ bool fsst = false;
+ // -------------------------------------------------------------------------------------
+ std::ostream &operator<<(string str)
+ {
+ if ( estimation_level == 0 ) {
+ return (log << str);
+ }
+ return log;
+ }
+ // -------------------------------------------------------------------------------------
+};
+class ThreadCache {
+public:
+ static tbb::enumerable_thread_specific<ThreadCacheContainer> data;
+ static ThreadCacheContainer &get();
+ // -------------------------------------------------------------------------------------
+ static void dumpSet(string rel_name, string col_name, string col_type);
+ static void dumpPush(string scheme_name, double cf, u32 before, u32 after, u32 unique_count, string comment = "");
+ static void dumpFsst(u32 before_total, u32 before_pool, u32 after_pool, u32 after_total);
+ // -------------------------------------------------------------------------------------
+ static bool hasUsedFsst();
+ static void setFsst();
+};
+// -------------------------------------------------------------------------------------
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.cpp
new file mode 100644
index 0000000..ccf641e
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.cpp
@@ -0,0 +1,47 @@
+#include "CScheme.hpp"
+#include "datablock/cache/ThreadCache.hpp"
+// -------------------------------------------------------------------------------------
+DEFINE_uint32(sample_size, 64, "");
+DEFINE_uint32(sample_count, 10, "");
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+double DoubleScheme::expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level)
+{
+ auto dest = makeBytesArray(FLAGS_sample_size * FLAGS_sample_count * sizeof(DOUBLE) * 100);
+ u32 total_before = 0;
+ u32 total_after = 0;
+ if ( ThreadCache::get().estimation_level++ >= 1 ) {
+ total_before += stats.total_size;
+ total_after += compress(stats.src, stats.bitmap, dest.get(), stats, allowed_cascading_level);
+ } else {
+ auto sample = stats.samples(FLAGS_sample_count, FLAGS_sample_size);
+ DoubleStats c_stats = DoubleStats::generateStats(std::get<0>(sample).data(), std::get<1>(sample).data(), std::get<0>(sample).size());
+ total_before += c_stats.total_size;
+ total_after += compress(std::get<0>(sample).data(), std::get<1>(sample).data(), dest.get(), c_stats, allowed_cascading_level);
+ }
+ ThreadCache::get().estimation_level--;
+ return CD(total_before) / CD(total_after);
+}
+// -------------------------------------------------------------------------------------
+double IntegerScheme::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+ auto dest = makeBytesArray(FLAGS_sample_size * FLAGS_sample_count * sizeof(INTEGER) * 100);
+ u32 total_before = 0;
+ u32 total_after = 0;
+ if ( ThreadCache::get().estimation_level++ >= 1 ) {
+ total_before += stats.total_size;
+ total_after += compress(stats.src, stats.bitmap, dest.get(), stats, allowed_cascading_level);
+ } else {
+ auto sample = stats.samples(FLAGS_sample_count, FLAGS_sample_size);
+ SInteger32Stats c_stats = SInteger32Stats::generateStats(std::get<0>(sample).data(), std::get<1>(sample).data(), std::get<0>(sample).size());
+ total_before += c_stats.total_size;
+ total_after += compress(std::get<0>(sample).data(), std::get<1>(sample).data(), dest.get(), c_stats, allowed_cascading_level);
+ }
+ ThreadCache::get().estimation_level--;
+ return CD(total_before) / CD(total_after);
+}
+// -------------------------------------------------------------------------------------
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.hpp
new file mode 100644
index 0000000..38ffb22
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CScheme.hpp
@@ -0,0 +1,147 @@
+#pragma once
+#include "Units.hpp"
+#include "storage/Chunk.hpp"
+#include "storage/Relation.hpp"
+#include "storage/StringArrayViewer.hpp"
+#include "datablock/stats/NumberStats.hpp"
+#include "datablock/stats/StringStats.hpp"
+#include "datablock/schemes/v2/bitmap/RoaringBitmap.hpp"
+// -------------------------------------------------------------------------------------
+#include "IntegerSchemeType.hpp"
+#include "DoubleSchemeType.hpp"
+#include "StringSchemeType.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+DECLARE_uint32(sample_size);
+DECLARE_uint32(sample_count);
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+using UInteger32Stats = NumberStats<u32>;
+using SInteger32Stats = NumberStats<INTEGER>;
+using DoubleStats = NumberStats<DOUBLE>;
+// -------------------------------------------------------------------------------------
+struct Predicate {
+};
+// -------------------------------------------------------------------------------------
+// expectedCompressionRatio should only be called at top level
+class IntegerScheme {
+public:
+ // -------------------------------------------------------------------------------------
+ virtual double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level);
+ // -------------------------------------------------------------------------------------
+ virtual u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) = 0;
+ // -------------------------------------------------------------------------------------
+ virtual void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) = 0;
+ // -------------------------------------------------------------------------------------
+ virtual IntegerSchemeType schemeType() = 0;
+ // -------------------------------------------------------------------------------------
+ virtual INTEGER lookup(u32 id) = 0;
+ // -------------------------------------------------------------------------------------
+ virtual void scan(Predicate, BITMAP *result, const u8 *src, u32 tuple_count) = 0;
+ // -------------------------------------------------------------------------------------
+ inline string selfDescription() {
+ return ConvertSchemeTypeToString(this->schemeType());
+ }
+ virtual string fullDescription(const u8 *) {
+ // Default implementation for schemes that do not have nested schemes
+ return this->selfDescription();
+ }
+ virtual bool isUsable(SInteger32Stats &) {
+ return true;
+ }
+};
+// -------------------------------------------------------------------------------------
+// Double
+// -------------------------------------------------------------------------------------
+class DoubleScheme {
+public:
+ // -------------------------------------------------------------------------------------
+ virtual double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level);
+ // -------------------------------------------------------------------------------------
+ virtual u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) = 0;
+ // -------------------------------------------------------------------------------------
+ virtual void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) = 0;
+ // -------------------------------------------------------------------------------------
+ virtual DoubleSchemeType schemeType() = 0;
+ // -------------------------------------------------------------------------------------
+ inline string selfDescription() {
+ return ConvertSchemeTypeToString(this->schemeType());
+ }
+ virtual string fullDescription(const u8 *) {
+ // Default implementation for schemes that do not have nested schemes
+ return this->selfDescription();
+ }
+ virtual bool isUsable(DoubleStats &) {
+ return true;
+ }
+};
+// -------------------------------------------------------------------------------------
+// String
+// -------------------------------------------------------------------------------------
+class StringScheme {
+public:
+ // -------------------------------------------------------------------------------------
+
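+ // Note: unlike IntegerScheme and DoubleScheme above, which get a default
+ // sampling-based estimator in CScheme.cpp, StringScheme keeps
+ // expectedCompressionRatio pure virtual -- each string scheme supplies its
+ // own estimate.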
virtual double expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level) = 0; + + // ------------------------------------------------------------------------------------- + // TODO get rid of this function + virtual bool usesFsst(const u8* src) { (void)src; return false; } + // ------------------------------------------------------------------------------------- + virtual u32 compress(StringArrayViewer src, const BITMAP *nullmap, u8 *dest, StringStats &stats) = 0; + // ------------------------------------------------------------------------------------- + virtual u32 getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) = 0; + // ------------------------------------------------------------------------------------- + virtual u32 getDecompressedSizeNoCopy(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) { + return this->getDecompressedSize(src, tuple_count, nullmap); + } + // ------------------------------------------------------------------------------------- + virtual u32 getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) = 0; + // ------------------------------------------------------------------------------------- + virtual void decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) = 0; + // ------------------------------------------------------------------------------------- + virtual bool decompressNoCopy(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) { + // The string representation is the same in any case. Still performing the copy does not lead to a wrong result. + this->decompress(dest, nullmap, src, tuple_count, level); + return false; + } + // ------------------------------------------------------------------------------------- + virtual StringSchemeType schemeType() = 0; + // ------------------------------------------------------------------------------------- + inline string selfDescription(const u8 *src = nullptr) { + auto description = ConvertSchemeTypeToString(this->schemeType()); + // TODO clean this up once we are done + if (this->schemeType() == StringSchemeType::S_DICT) { + if (src == nullptr) { + description += "_UNKNOWN"; + } else if (this->usesFsst(src)) { + description += "_FSST"; + } else { + description += "_RAW"; + } + } + return description; + } + virtual string fullDescription(const u8 *) { + // Default implementation for schemes that do not have nested schemes + return this->selfDescription(); + } + virtual bool isUsable(StringStats &) { + return true; + } +}; +// ------------------------------------------------------------------------------------- +} +} +// ------------------------------------------------------------------------------------- +#define MAX_STR_LENGTH 2048 * 4 + +/* + * + * General plan: + * each scheme needs only to register its offset from the datablock + * and it can define a structure e.g (header; slots[];..) 
and cast the beginning pointer to its type + * + */ diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.cpp new file mode 100644 index 0000000..88d5f02 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.cpp @@ -0,0 +1,22 @@ +#include "CScheme.hpp" +#include "CSchemePool.hpp" +#include "utils/Utils.hpp" +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +bool shouldUseFOR(str min) +{ + return false; +} +bool shouldUseFOR(DOUBLE min) +{ + return false; + return (Utils::getBitsNeeded(static_cast(min)) >= 8); +} +bool shouldUseFOR(INTEGER min) +{ + return false; + return (Utils::getBitsNeeded(min) >= 8); +} +} +} diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.hpp new file mode 100644 index 0000000..4d1a43f --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePicker.hpp @@ -0,0 +1,291 @@ +#pragma once +#include "datablock/Datablock.hpp" +#include "CScheme.hpp" +#include "CSchemePool.hpp" +#include "utils/Utils.hpp" +#include "datablock/cache/ThreadCache.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "spdlog/spdlog.h" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +DECLARE_uint32(force_scheme); +DECLARE_bool(try_all_schemes); +DECLARE_uint32(force_integer_scheme); +DECLARE_uint32(force_double_scheme); +DECLARE_uint32(force_string_scheme); +// ------------------------------------------------------------------------------------- +DECLARE_bool(sampling_test_mode); +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +// ------------------------------------------------------------------------------------- +template +class TypeWrapper { + +}; +// ------------------------------------------------------------------------------------- +bool shouldUseFOR(str min); +bool shouldUseFOR(DOUBLE min); +bool shouldUseFOR(INTEGER min); +// ------------------------------------------------------------------------------------- +template +class CSchemePicker { +public: + using MyTypeWrapper = TypeWrapper; + static SchemeType &chooseScheme(StatsType &stats, u8 allowed_cascading_level) + { + double max_compression_ratio = 0; + if ( MyTypeWrapper::getOverrideScheme() != AUTO_SCHEME ) { + auto scheme_code = MyTypeWrapper::getOverrideScheme(); + MyTypeWrapper::getOverrideScheme() = AUTO_SCHEME; + return MyTypeWrapper::getScheme(scheme_code); + } else { + SchemeType *preferred_scheme = nullptr; + for ( auto &scheme: MyTypeWrapper::getSchemes()) { + if ( ThreadCache::get().estimation_level != 0 || ThreadCache::get().compression_level > 1 ) { + if ( scheme.second->schemeType() == SchemeCodeType::ONE_VALUE ) { + continue; + } + } + + if (!scheme.second->isUsable(stats)) { + continue; + } + + auto compression_ratio = scheme.second->expectedCompressionRatio(stats, allowed_cascading_level); + if ( compression_ratio > max_compression_ratio ) { + max_compression_ratio = compression_ratio; + preferred_scheme = scheme.second.get(); + } + } + if ( max_compression_ratio < 
1.0 ) { + throw Generic_Exception("compression ratio lower than 1"); + } + if ( preferred_scheme == nullptr ) { + throw Generic_Exception("No compression scheme found for the input !!"); + } + return *preferred_scheme; + } + } + // ------------------------------------------------------------------------------------- + static void compress(const Type *src, const BITMAP *nullmap, u8 *dest, u32 tuple_count, u8 allowed_cascading_level, u32 &after_size, u8 &scheme_code, u8 force_scheme = AUTO_SCHEME, string comment = "?") + { + ThreadCache::get().compression_level++; + StatsType stats = StatsType::generateStats(src, nullmap, tuple_count); + SchemeType *preferred_scheme = nullptr; + + if (FLAGS_sampling_test_mode) { + if (allowed_cascading_level == 0 || tuple_count == 0) { + spdlog::debug(MyTypeWrapper::getTypeName() + ": UNCOMPRESSED"); + preferred_scheme = &MyTypeWrapper::getScheme(SchemeCodeType::UNCOMPRESSED); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, 0); + scheme_code = CB(preferred_scheme->schemeType()); + } else { + auto tmp_dest = makeBytesArray(stats.total_size * 10); + u32 least_after_size = std::numeric_limits::max(); + //SchemeType *preferred_scheme = nullptr; + for ( auto &scheme: MyTypeWrapper::getSchemes()) { + if ( scheme.second->expectedCompressionRatio(stats, allowed_cascading_level) > 0 ) { + u32 after_size = scheme.second->compress(src, nullmap, tmp_dest.get(), stats, allowed_cascading_level); + if ( after_size < least_after_size ) { + least_after_size = after_size; + preferred_scheme = scheme.second.get(); + } + } + } + die_if(preferred_scheme != nullptr); + scheme_code = CB(preferred_scheme->schemeType()); + spdlog::debug((MyTypeWrapper::getTypeName() + ": {}").c_str(), scheme_code); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, allowed_cascading_level); + } + ThreadCache::get().compression_level--; + return; + } + + // ALP_CHG + // make the first condition always false to disable PED. 
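+ // In other words: the trailing `&& false` in the condition below makes the
+ // whole ONE_VALUE guard constant-false, so even all-null or single-value
+ // columns skip the ONE_VALUE fast path and fall through to the regular
+ // UNCOMPRESSED / X_FOR / scheme-picking branches.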
+ if ((ThreadCache::get().estimation_level == 0 && ThreadCache::get().compression_level == 1) && (stats.null_count == stats.tuple_count || stats.unique_count == 1) && false) { + spdlog::debug(MyTypeWrapper::getTypeName() + ": ONE_VALUE"); + preferred_scheme = &MyTypeWrapper::getScheme(SchemeCodeType::ONE_VALUE); + scheme_code = CB(preferred_scheme->schemeType()); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, 0); + } else if ( allowed_cascading_level == 0 || tuple_count == 0 ) { + spdlog::debug(MyTypeWrapper::getTypeName() + ": UNCOMPRESSED"); + preferred_scheme = &MyTypeWrapper::getScheme(SchemeCodeType::UNCOMPRESSED); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, 0); + scheme_code = CB(preferred_scheme->schemeType()); + } else if ( shouldUseFOR(stats.min) && allowed_cascading_level > 1 ) { + spdlog::debug(MyTypeWrapper::getTypeName() + ": X_FOR"); + preferred_scheme = &MyTypeWrapper::getScheme(SchemeCodeType::X_FOR); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, allowed_cascading_level); + scheme_code = CB(preferred_scheme->schemeType()); + } else { + if ( FLAGS_try_all_schemes ) { + auto tmp_dest = makeBytesArray(stats.total_size * 10); + u32 least_after_size = std::numeric_limits::max(); + //SchemeType *preferred_scheme = nullptr; + for ( auto &scheme: MyTypeWrapper::getSchemes()) { + if ( scheme.second->expectedCompressionRatio(stats, allowed_cascading_level) > 0 ) { + u32 after_size = scheme.second->compress(src, nullmap, tmp_dest.get(), stats, allowed_cascading_level); + if ( after_size < least_after_size ) { + least_after_size = after_size; + preferred_scheme = scheme.second.get(); + } + } + } + die_if(preferred_scheme != nullptr); + scheme_code = CB(preferred_scheme->schemeType()); + spdlog::debug((MyTypeWrapper::getTypeName() + ": {}").c_str(), scheme_code); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, allowed_cascading_level); + } else { + if ( force_scheme != AUTO_SCHEME ) { + preferred_scheme = &MyTypeWrapper::getScheme(force_scheme); + } else if ( MyTypeWrapper::getOverrideScheme() != AUTO_SCHEME ) { + preferred_scheme = &MyTypeWrapper::getScheme(MyTypeWrapper::getOverrideScheme()); + MyTypeWrapper::getOverrideScheme() = AUTO_SCHEME; + } else { + preferred_scheme = &chooseScheme(stats, allowed_cascading_level); + } + die_if(preferred_scheme != nullptr); + scheme_code = CB(preferred_scheme->schemeType()); + spdlog::debug((MyTypeWrapper::getTypeName() + ": {}").c_str(), scheme_code); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, allowed_cascading_level); + } + } + if ( ThreadCache::get().isOnHotPath()) { + if ( (after_size > stats.total_size) ) { + cerr << "!!! compressed is larger than raw: \nfor : " + comment + " - scheme = " + ConvertSchemeTypeToString(static_cast(scheme_code)) + << " difference = " << after_size - stats.total_size << "." + << " Falling back to uncompressed." 
+ << endl; + preferred_scheme = &MyTypeWrapper::getScheme(SchemeCodeType::UNCOMPRESSED); + scheme_code = CB(preferred_scheme->schemeType()); + after_size = preferred_scheme->compress(src, nullmap, dest, stats, allowed_cascading_level); + } + for ( u8 i = 0; i < 5 - allowed_cascading_level; i++ ) { + ThreadCache::get() << "\t"; + } + ThreadCache::get() << "for : " + comment + " - scheme = " + ConvertSchemeTypeToString(static_cast(scheme_code)) + + " before = " + std::to_string(stats.total_size) + + " after = " + std::to_string(after_size) + + " gain = " + std::to_string(CD(stats.total_size) / CD(after_size)) + + '\n'; + double estimated_cf = preferred_scheme->expectedCompressionRatio(stats, allowed_cascading_level); + ThreadCache::dumpPush(ConvertSchemeTypeToString(static_cast(scheme_code)), + estimated_cf, + stats.total_size, after_size, + stats.unique_count, comment); + + // if ( estimated_cf / (CD(stats.total_size) / CD(after_size)) >= 100 ) { + // for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) { + // if ( nullmap == nullptr || nullmap[row_i] ) { + // cout << src[row_i] << ';'; + // } else { + // cout << "N;"; + // } + // } + // cout << endl; + // } + } + ThreadCache::get().compression_level--; + } +}; +// ------------------------------------------------------------------------------------- +template<> +class TypeWrapper { +public: + // ------------------------------------------------------------------------------------- + static std::unordered_map> &getSchemes() + { + return CSchemePool::available_schemes->integer_schemes; + } + // ------------------------------------------------------------------------------------- + static IntegerScheme &getScheme(IntegerSchemeType code) + { + return *getSchemes()[code]; + } + // ------------------------------------------------------------------------------------- + static IntegerScheme &getScheme(u8 code) + { + return *getSchemes()[static_cast(code)]; + } + // ------------------------------------------------------------------------------------- + static u32 &getOverrideScheme() + { + return FLAGS_force_integer_scheme; + } + // ------------------------------------------------------------------------------------- + static inline string getTypeName() + { + return "INTEGER"; + } +}; +// ------------------------------------------------------------------------------------- +template<> +class TypeWrapper { +public: + // ------------------------------------------------------------------------------------- + static std::unordered_map> &getSchemes() + { + return CSchemePool::available_schemes->double_schemes; + } + // ------------------------------------------------------------------------------------- + static DoubleScheme &getScheme(DoubleSchemeType code) + { + return *getSchemes()[code]; + } + // ------------------------------------------------------------------------------------- + static DoubleScheme &getScheme(u8 code) + { + return *getSchemes()[static_cast(code)]; + } + // ------------------------------------------------------------------------------------- + static u32 &getOverrideScheme() + { + return FLAGS_force_double_scheme; + } + // ------------------------------------------------------------------------------------- + static inline string getTypeName() + { + return "DOUBLE"; + } + // ------------------------------------------------------------------------------------- +}; +// ------------------------------------------------------------------------------------- +template<> +class TypeWrapper { +public: + // 
------------------------------------------------------------------------------------- + static std::unordered_map> &getSchemes() + { + return CSchemePool::available_schemes->string_schemes; + } + // ------------------------------------------------------------------------------------- + static StringScheme &getScheme(StringSchemeType code) + { + return *getSchemes()[code]; + } + // ------------------------------------------------------------------------------------- + static StringScheme &getScheme(u8 code) + { + return *getSchemes()[static_cast(code)]; + } + // ------------------------------------------------------------------------------------- + static u32 &getOverrideScheme() + { + return FLAGS_force_string_scheme; + } + // ------------------------------------------------------------------------------------- + static inline string getTypeName() + { + return "STRING"; + } + // ------------------------------------------------------------------------------------- +}; +// ------------------------------------------------------------------------------------- +using IntegerSchemePicker = CSchemePicker; +using DoubleSchemePicker = CSchemePicker; +using StringSchemePicker = CSchemePicker; +} +} diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.cpp new file mode 100644 index 0000000..fba7082 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.cpp @@ -0,0 +1,167 @@ +#include "CSchemePool.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v1/integer/OneValue.hpp" +#include "datablock/schemes/v1/integer/Truncation.hpp" +#include "datablock/schemes/v1/integer/Dictionary.hpp" +#include "datablock/schemes/v1/integer/Uncompressed.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v1/double/OneValue.hpp" +#include "datablock/schemes/v1/double/Uncompressed.hpp" +#include "datablock/schemes/v1/double/Dictionary.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v1/string/Dictionary.hpp" +#include "datablock/schemes/v1/string/OneValue.hpp" +#include "datablock/schemes/v1/string/Uncompressed.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v2/string/DynamicDictionary.hpp" +#include "datablock/schemes/v2/string/Fsst.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v2/double/Decimal.hpp" +#include "datablock/schemes/v2/double/MaxExponent.hpp" +#include "datablock/schemes/v2/double/RLE.hpp" +#include "datablock/schemes/v2/double/DynamicDictionary.hpp" +#include "datablock/schemes/v2/double/Frequency.hpp" +#include "datablock/schemes/v2/double/Hacky.hpp" +#include "datablock/schemes/v2/double/DoubleBP.hpp" +// ------------------------------------------------------------------------------------- +#include "datablock/schemes/v2/integer/PBP.hpp" +#include "datablock/schemes/v2/integer/RLE.hpp" +#include "datablock/schemes/v2/integer/FOR.hpp" +#include "datablock/schemes/v2/integer/Frequency.hpp" +#include "datablock/schemes/v2/integer/DynamicDictionary.hpp" +// ------------------------------------------------------------------------------------- +#include "utils/Utils.hpp" +#include "gflags/gflags.h" 
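+// (Note: the DEFINE_* flags below select which schemes SchemesCollection
+// registers. The force_*_scheme flags default to 255, which the pickers
+// appear to treat as AUTO_SCHEME, i.e. "no override"; see CSchemePicker.hpp.)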
+#include "spdlog/spdlog.h" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +DEFINE_bool(db1, false, "Only use the compression schemes in the original Datablocks paper"); +DEFINE_bool(db2, true, ""); +// Integer flags +DEFINE_bool(integer_for, false, ""); // disabled in final version +DEFINE_bool(integer_fbp, true, ""); +DEFINE_bool(integer_pbp, false, ""); +DEFINE_bool(integer_delta, false, ""); // disabled in final version +DEFINE_bool(integer_dictionary, false, ""); +DEFINE_bool(integer_rle, false, ""); +DEFINE_bool(integer_frequency, false, ""); // Disabled because of slow decompression speeds. +// Double flags +DEFINE_bool(double_dictionary, false, ""); +DEFINE_bool(double_rle, false, ""); +DEFINE_bool(double_frequency, false, ""); +DEFINE_bool(double_decimal, false, ""); +DEFINE_bool(double_bp, true, ""); +DEFINE_bool(double_two_way, false, ""); +// String flags +DEFINE_bool(string_dictionary, true, ""); +DEFINE_bool(string_fsst, true, ""); +// ------------------------------------------------------------------------------------- +DEFINE_uint32(force_scheme, 255, ""); +DEFINE_uint32(force_string_scheme, 255, ""); +DEFINE_uint32(force_integer_scheme, 255, ""); +DEFINE_uint32(force_double_scheme, 255, ""); +// ------------------------------------------------------------------------------------- +DEFINE_bool(try_all_schemes, false, ""); +DEFINE_bool(sampling_test_mode, false, ""); +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +// ------------------------------------------------------------------------------------- +SchemesCollection::SchemesCollection() +{ + if ( FLAGS_db2 ) { + // Integer + { + // These two are required. + integer_schemes.emplace(v1::integer::OneValue::staticSchemeType(), make_unique()); + integer_schemes.emplace(v1::integer::Uncompressed::staticSchemeType(), make_unique()); + if (FLAGS_integer_for) { + integer_schemes.emplace(v2::integer::FOR::staticSchemeType(), make_unique()); + } + if (FLAGS_integer_fbp) { + integer_schemes.emplace(v2::integer::FBP::staticSchemeType(), make_unique()); + // Never tried this one. Seems to be incomplete anyway. + //integer_schemes.emplace(v2::integer::EXP_FBP::staticSchemeType(), make_unique()); + } + if (FLAGS_integer_pbp) { + integer_schemes.emplace(v2::integer::PBP::staticSchemeType(), make_unique()); + } + if (FLAGS_integer_delta) { + integer_schemes.emplace(v2::integer::PBP_DELTA::staticSchemeType(), make_unique()); + } + if (FLAGS_integer_dictionary) { + integer_schemes.emplace(v2::integer::DynamicDictionary::staticSchemeType(), make_unique()); + } + if (FLAGS_integer_rle) { + integer_schemes.emplace(v2::integer::RLE::staticSchemeType(), make_unique()); + } + if (FLAGS_integer_frequency) { + integer_schemes.emplace(v2::integer::Frequency::staticSchemeType(), make_unique()); + } + } + + // Double + { + // These two are required. 
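+ // (As with the integer schemes above: the pickers rely on being able to
+ // fall back to UNCOMPRESSED whenever a scheme would inflate a column, and
+ // chooseScheme may still route constant columns to ONE_VALUE, so these two
+ // must always be registered.)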
+ double_schemes.emplace(v1::d::OneValue::staticSchemeType(), make_unique<v1::d::OneValue>());
+ double_schemes.emplace(v1::d::Uncompressed::staticSchemeType(), make_unique<v1::d::Uncompressed>());
+ if (FLAGS_double_dictionary) {
+ double_schemes.emplace(v2::d::DynamicDictionary::staticSchemeType(), make_unique<v2::d::DynamicDictionary>());
+ }
+ if (FLAGS_double_rle) {
+ double_schemes.emplace(v2::d::RLE::staticSchemeType(), make_unique<v2::d::RLE>());
+ }
+ if (FLAGS_double_frequency) {
+ double_schemes.emplace(v2::d::Frequency::staticSchemeType(), make_unique<v2::d::Frequency>());
+ }
+ if (FLAGS_double_decimal) {
+ double_schemes.emplace(v2::d::Decimal::staticSchemeType(), make_unique<v2::d::Decimal>());
+ //double_schemes.emplace(v2::d::MaxExponent::staticSchemeType(), make_unique<v2::d::MaxExponent>());
+ }
+ if (FLAGS_double_bp) {
+ double_schemes.emplace(v2::d::DoubleBP::staticSchemeType(), make_unique<v2::d::DoubleBP>());
+ }
+ if (FLAGS_double_two_way) {
+ double_schemes.emplace(v2::d::Hacky::staticSchemeType(), make_unique<v2::d::Hacky>());
+ }
+ }
+
+ // String
+ {
+ // These two are required.
+ string_schemes.emplace(v1::string::OneValue::staticSchemeType(), make_unique<v1::string::OneValue>());
+ string_schemes.emplace(v1::string::Uncompressed::staticSchemeType(), make_unique<v1::string::Uncompressed>());
+ if (FLAGS_string_dictionary) {
+ string_schemes.emplace(v2::string::DynamicDictionary::staticSchemeType(),
+ make_unique<v2::string::DynamicDictionary>());
+ }
+ if (FLAGS_string_fsst) {
+ string_schemes.emplace(v2::string::Fsst::staticSchemeType(), make_unique<v2::string::Fsst>());
+ }
+ }
+ } else {
+ // -------------------------------------------------------------------------------------
+ integer_schemes.emplace(v1::integer::OneValue::staticSchemeType(), make_unique<v1::integer::OneValue>());
+ integer_schemes.emplace(v1::integer::Uncompressed::staticSchemeType(), make_unique<v1::integer::Uncompressed>());
+ integer_schemes.emplace(v1::integer::Truncation8::staticSchemeType(), make_unique<v1::integer::Truncation8>());
+ integer_schemes.emplace(v1::integer::Truncation16::staticSchemeType(), make_unique<v1::integer::Truncation16>());
+ integer_schemes.emplace(v1::integer::Dictionary8::staticSchemeType(), make_unique<v1::integer::Dictionary8>());
+ integer_schemes.emplace(v1::integer::Dictionary16::staticSchemeType(), make_unique<v1::integer::Dictionary16>());
+ // -------------------------------------------------------------------------------------
+ double_schemes.emplace(v1::d::OneValue::staticSchemeType(), make_unique<v1::d::OneValue>());
+ double_schemes.emplace(v1::d::Uncompressed::staticSchemeType(), make_unique<v1::d::Uncompressed>());
+ double_schemes.emplace(v1::d::Dictionary8::staticSchemeType(), make_unique<v1::d::Dictionary8>());
+ double_schemes.emplace(v1::d::Dictionary16::staticSchemeType(), make_unique<v1::d::Dictionary16>());
+ // -------------------------------------------------------------------------------------
+ string_schemes.emplace(v1::string::OneValue::staticSchemeType(), make_unique<v1::string::OneValue>());
+ string_schemes.emplace(v1::string::Uncompressed::staticSchemeType(), make_unique<v1::string::Uncompressed>());
+ string_schemes.emplace(v1::string::Dictionary8::staticSchemeType(), make_unique<v1::string::Dictionary8>());
+ string_schemes.emplace(v1::string::Dictionary16::staticSchemeType(), make_unique<v1::string::Dictionary16>());
+ }
+}
+// -------------------------------------------------------------------------------------
+unique_ptr<SchemesCollection> CSchemePool::available_schemes = unique_ptr<SchemesCollection>(nullptr);
+// -------------------------------------------------------------------------------------
+}
+// -------------------------------------------------------------------------------------
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.hpp
new file mode 100644
index 0000000..4f6268c
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/CSchemePool.hpp
@@ -0,0 +1,22 @@
+#pragma once
+#include "CScheme.hpp"
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+struct SchemesCollection {
+ std::unordered_map<IntegerSchemeType, unique_ptr<IntegerScheme>> integer_schemes;
+ std::unordered_map<DoubleSchemeType, unique_ptr<DoubleScheme>> double_schemes;
+ std::unordered_map<StringSchemeType, unique_ptr<StringScheme>> string_schemes;
+ SchemesCollection();
+};
+// -------------------------------------------------------------------------------------
+class CSchemePool {
+public:
+ static unique_ptr<SchemesCollection> available_schemes;
+ // -------------------------------------------------------------------------------------
+ static void refresh();
+};
+// -------------------------------------------------------------------------------------
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.cpp
new file mode 100644
index 0000000..a218739
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.cpp
@@ -0,0 +1,36 @@
+#include "Units.hpp"
+#include "DoubleSchemeType.hpp"
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+string ConvertSchemeTypeToString(DoubleSchemeType type)
+{
+ switch ( type ) {
+ case DoubleSchemeType::X_DECIMAL:
+ return "X_DECIMAL";
+ case DoubleSchemeType::X_RLE:
+ return "X_RLE";
+ case DoubleSchemeType::X_DICT:
+ return "X_DICT";
+ case DoubleSchemeType::X_FREQUENCY:
+ return "X_FREQUENCY";
+ case DoubleSchemeType::X_HACKY:
+ return "X_HACKY";
+ case DoubleSchemeType::ONE_VALUE:
+ return "ONE_VALUE";
+ case DoubleSchemeType::DICTIONARY_8:
+ return "DICTIONARY_8";
+ case DoubleSchemeType::DICTIONARY_16:
+ return "DICTIONARY_16";
+ case DoubleSchemeType::UNCOMPRESSED:
+ return "UNCOMPRESSED";
+ case DoubleSchemeType::X_FOR:
+ return "X_FOR";
+ default:
+ throw Generic_Exception("Unknown DoubleSchemeType");
+ }
+}
+// -------------------------------------------------------------------------------------
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.hpp
new file mode 100644
index 0000000..65e2ed0
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/DoubleSchemeType.hpp
@@ -0,0 +1,21 @@
+#pragma once
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+enum class DoubleSchemeType : u8 {
+ X_DECIMAL,
+ X_RLE,
+ X_DICT,
+ X_FREQUENCY,
+ X_HACKY,
+ ONE_VALUE,
+ DICTIONARY_8,
+ DICTIONARY_16,
+ UNCOMPRESSED,
+ X_FOR, // TODO: dirty hack
+ DOUBLE_BP
+};
+string ConvertSchemeTypeToString(DoubleSchemeType type);
+// -------------------------------------------------------------------------------------
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.cpp
new file mode 100644
index 0000000..c9b897e
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.cpp
@@ -0,0 +1,42 @@
+#include "Units.hpp"
+#include "IntegerSchemeType.hpp"
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+string ConvertSchemeTypeToString(IntegerSchemeType type)
+{
+ switch ( type ) {
+ case IntegerSchemeType::X_PBP:
+ return "X_PBP";
+ case IntegerSchemeType::X_PBP_DELTA:
+ return "X_PBP_DELTA";
+ case IntegerSchemeType::X_FBP:
+ return "X_FBP";
+ case IntegerSchemeType::X_RLE:
+ return "X_RLE";
+ case IntegerSchemeType::X_DICT:
+ return "X_DICT";
+ case IntegerSchemeType::X_FREQUENCY:
+ return "X_FREQUENCY";
+ case IntegerSchemeType::ONE_VALUE:
+ return "ONE_VALUE";
+ case IntegerSchemeType::DICTIONARY_8:
+ return "DICTIONARY_8";
+ case IntegerSchemeType::DICTIONARY_16:
+ return "DICTIONARY_16";
+ case IntegerSchemeType::TRUNCATION_8:
+ return "TRUNCATION_8";
+ case IntegerSchemeType::TRUNCATION_16:
+ return "TRUNCATION_16";
+ case IntegerSchemeType::UNCOMPRESSED:
+ return "UNCOMPRESSED";
+ case IntegerSchemeType::X_FOR:
+ return "X_FOR";
+ default:
+ throw Generic_Exception("Unknown IntegerSchemeType");
+ }
+}
+// -------------------------------------------------------------------------------------
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.hpp
new file mode 100644
index 0000000..cab320b
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/IntegerSchemeType.hpp
@@ -0,0 +1,23 @@
+#pragma once
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+enum class IntegerSchemeType : u8 {
+ X_PBP,
+ X_PBP_DELTA,
+ X_FBP,
+ X_RLE,
+ X_DICT,
+ X_FREQUENCY,
+ X_FOR,
+ ONE_VALUE,
+ UNCOMPRESSED,
+ TRUNCATION_8,
+ TRUNCATION_16,
+ DICTIONARY_8,
+ DICTIONARY_16
+};
+string ConvertSchemeTypeToString(IntegerSchemeType type);
+// -------------------------------------------------------------------------------------
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.cpp
new file mode 100644
index 0000000..353714d
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.cpp
@@ -0,0 +1,28 @@
+#include "Units.hpp"
+#include "StringSchemeType.hpp"
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+string ConvertSchemeTypeToString(StringSchemeType type)
+{
+ switch ( type ) {
+ case StringSchemeType::ONE_VALUE:
+ return "ONE_VALUE";
+ case StringSchemeType::DICTIONARY_8:
+ return "DICTIONARY_8";
+ case StringSchemeType::DICTIONARY_16:
+ return "DICTIONARY_16";
+ case StringSchemeType::S_DICT:
+ return "S_DICT";
+ case StringSchemeType::UNCOMPRESSED:
+ return "UNCOMPRESSED";
+ case StringSchemeType::FSST:
+ return "FSST";
+ default:
+ throw Generic_Exception("Unknown StringSchemeType");
+ }
+}
+// -------------------------------------------------------------------------------------
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.hpp
new file mode 100644
index 0000000..8df66c0
--- /dev/null
+++ 
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/StringSchemeType.hpp
@@ -0,0 +1,16 @@
+#pragma once
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+enum class StringSchemeType : u8 {
+   ONE_VALUE,
+   DICTIONARY_8,
+   DICTIONARY_16,
+   S_DICT,
+   UNCOMPRESSED,
+   FSST
+};
+string ConvertSchemeTypeToString(StringSchemeType type);
+// -------------------------------------------------------------------------------------
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Dictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Dictionary.hpp
new file mode 100644
index 0000000..bae7b56
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Dictionary.hpp
@@ -0,0 +1,64 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/v1/templated/FixedDictionary.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace d {
+// -------------------------------------------------------------------------------------
+class Dictionary8 : public DoubleScheme {
+public:
+   inline double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) override
+   {
+      return FDictExpectedCompressionRatio<DOUBLE, u8>(stats);
+   }
+   inline u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override
+   {
+      return FDictCompressColumn<DOUBLE, u8>(src, nullmap, dest, stats);
+   }
+   inline void decompress(DOUBLE *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override
+   {
+      return FDictDecompressColumn<DOUBLE, u8>(dest, nullmap, src, tuple_count, level);
+   }
+   inline virtual DoubleSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::DICTIONARY_8;
+   }
+};
+// -------------------------------------------------------------------------------------
+class Dictionary16 : public DoubleScheme {
+public:
+   inline double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) override
+   {
+      return FDictExpectedCompressionRatio<DOUBLE, u16>(stats);
+   }
+   inline u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8) override
+   {
+      return FDictCompressColumn<DOUBLE, u16>(src, nullmap, dest, stats);
+   }
+   void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override
+   {
+      return FDictDecompressColumn<DOUBLE, u16>(dest, bitmap, src, tuple_count, level);
+   }
+   inline virtual DoubleSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::DICTIONARY_16;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.cpp
new file mode 100644
index 0000000..830262f
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.cpp
@@ -0,0 +1,47 @@
+#include "Units.hpp"
+#include "OneValue.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace d {
+// -------------------------------------------------------------------------------------
+double OneValue::expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level)
+{
+   if ( stats.distinct_values.size() <= 1 ) {
+      return stats.tuple_count;
+   } else {
+      return 0;
+   }
+}
+// -------------------------------------------------------------------------------------
+u32 OneValue::compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level)
+{
+   auto &col_struct = *reinterpret_cast<OneValueStructure *>(dest);
+   if ( src != nullptr ) {
+      col_struct.one_value = stats.distinct_values.begin()->first;
+   } else {
+      col_struct.one_value = NULL_CODE;
+   }
+   return sizeof(DOUBLE);
+}
+// -------------------------------------------------------------------------------------
+void OneValue::decompress(DOUBLE *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   const auto &col_struct = *reinterpret_cast<const OneValueStructure *>(src);
+   for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) { // can be further optimized probably
+      dest[row_i] = col_struct.one_value;
+   }
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.hpp
new file mode 100644
index 0000000..a53cb60
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/OneValue.hpp
@@ -0,0 +1,34 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace d {
+// -------------------------------------------------------------------------------------
+struct OneValueStructure {
+   DOUBLE one_value;
+};
+// -------------------------------------------------------------------------------------
+class OneValue : public DoubleScheme {
+public:
+   double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override;
+   void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual DoubleSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::ONE_VALUE;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.cpp
new file mode 100644
index 0000000..17416f1
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.cpp
@@ -0,0 +1,35 @@
+#include "Units.hpp"
+#include "Uncompressed.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace d {
+// -------------------------------------------------------------------------------------
+double Uncompressed::expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level)
+{
+   return 1.0;
+}
+// -------------------------------------------------------------------------------------
+u32 Uncompressed::compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level)
+{
+   std::memcpy(dest, src, stats.total_size);
+   return stats.total_size;
+}
+// -------------------------------------------------------------------------------------
+void Uncompressed::decompress(DOUBLE *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   std::memcpy(dest, src, tuple_count * sizeof(DOUBLE));
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.hpp
new file mode 100644
index 0000000..43064f1
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/double/Uncompressed.hpp
@@ -0,0 +1,30 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace d {
+// -------------------------------------------------------------------------------------
+class Uncompressed : public DoubleScheme {
+public:
+   double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override;
+   void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual DoubleSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::UNCOMPRESSED;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Dictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Dictionary.hpp
new file mode 100644
index 0000000..aaadc81
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Dictionary.hpp
@@ -0,0 +1,73 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/v1/templated/FixedDictionary.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+class Dictionary16 : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override
+   {
+      return FDictExpectedCompressionRatio<INTEGER, u16>(stats);
+   }
+   // -------------------------------------------------------------------------------------
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8) override
+   {
+      return FDictCompressColumn<INTEGER, u16>(src, nullmap, dest, stats);
+   }
+   void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override
+   {
+      return FDictDecompressColumn<INTEGER, u16>(dest, nullmap, src, tuple_count, level);
+   }
+   // -------------------------------------------------------------------------------------
+   inline virtual IntegerSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static IntegerSchemeType staticSchemeType()
+   {
+      return IntegerSchemeType::DICTIONARY_16;
+   }
+   // -------------------------------------------------------------------------------------
+   INTEGER lookup(u32) { UNREACHABLE(); }
+   void scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+};
+// -------------------------------------------------------------------------------------
+class Dictionary8 : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override
+   {
+      return FDictExpectedCompressionRatio<INTEGER, u8>(stats);
+   }
+   // -------------------------------------------------------------------------------------
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override
+   {
+      return FDictCompressColumn<INTEGER, u8>(src, nullmap, dest, stats);
+   }
+   void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override
+   {
+      return FDictDecompressColumn<INTEGER, u8>(dest, nullmap, src, tuple_count, level);
+   }
+   // -------------------------------------------------------------------------------------
+   inline virtual IntegerSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static IntegerSchemeType staticSchemeType()
+   {
+      return IntegerSchemeType::DICTIONARY_8;
+   }
+   // -------------------------------------------------------------------------------------
+   INTEGER lookup(u32) { UNREACHABLE(); }
+   void scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.cpp
new file mode 100644
index 0000000..f1cd2d9
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.cpp
@@ -0,0 +1,50 @@
+#include "Units.hpp"
+#include "OneValue.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+double OneValue::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   if ( stats.distinct_values.size() <= 1 ) {
+      return stats.tuple_count;
+   } else {
+      return 0;
+   }
+}
+// -------------------------------------------------------------------------------------
+u32 OneValue::compress(const INTEGER *src, const BITMAP *, u8 *dest, SInteger32Stats &stats, u8)
+{
+   auto &col_struct = *reinterpret_cast<OneValueStructure *>(dest);
+   if ( src != nullptr ) {
+      col_struct.one_value = stats.distinct_values.begin()->first;
+   } else {
+      col_struct.one_value = NULL_CODE;
+   }
+   return sizeof(UINTEGER);
+}
+// -------------------------------------------------------------------------------------
+void OneValue::decompress(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   const auto &col_struct = *reinterpret_cast<const OneValueStructure *>(src);
+   for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) { // can be further optimized probably
+      dest[row_i] = col_struct.one_value;
+   }
+}
+// -------------------------------------------------------------------------------------
+INTEGER OneValue::lookup(u32) { UNREACHABLE(); }
+void OneValue::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.hpp
new file mode 100644
index 0000000..2dbb0b6
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/OneValue.hpp
@@ -0,0 +1,36 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+struct OneValueStructure {
+   UINTEGER one_value;
+};
+// -------------------------------------------------------------------------------------
+class OneValue : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual IntegerSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static IntegerSchemeType staticSchemeType()
+   {
+      return IntegerSchemeType::ONE_VALUE;
+   }
+   INTEGER lookup(u32);
+   void scan(Predicate, BITMAP *, const u8 *, u32);
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.cpp
new file mode 100644
index 0000000..7cdc94d
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.cpp
@@ -0,0 +1,56 @@
+#include "Units.hpp"
+#include "Truncation.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+double Truncation16::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   return ITruncExpectedCF<u16>(stats);
+}
+// -------------------------------------------------------------------------------------
+u32 Truncation16::compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   return ITruncCompress<u16>(src, nullmap, dest, stats);
+}
+// -------------------------------------------------------------------------------------
+void Truncation16::decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   ITruncDecompress<u16>(dest, nullmap, src, tuple_count, level);
+}
+// -------------------------------------------------------------------------------------
+INTEGER Truncation16::lookup(u32) { UNREACHABLE(); }
+void Truncation16::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+// -------------------------------------------------------------------------------------
+// Truncation with 8 bits
+// -------------------------------------------------------------------------------------
+double Truncation8::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   return ITruncExpectedCF<u8>(stats);
+}
+// -------------------------------------------------------------------------------------
+u32 Truncation8::compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   return ITruncCompress<u8>(src, nullmap, dest, stats);
+}
+// -------------------------------------------------------------------------------------
+void Truncation8::decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   ITruncDecompress<u8>(dest, nullmap, src, tuple_count, level);
+}
+// -------------------------------------------------------------------------------------
+void Truncation8::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+INTEGER Truncation8::lookup(u32) { UNREACHABLE(); }
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.hpp
new file mode 100644
index 0000000..6d9007f
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Truncation.hpp
@@ -0,0 +1,118 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+class Truncation16 : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual IntegerSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static IntegerSchemeType staticSchemeType()
+   {
+      return IntegerSchemeType::TRUNCATION_16;
+   }
+   // -------------------------------------------------------------------------------------
+   INTEGER lookup(u32);
+   void scan(Predicate, BITMAP *, const u8 *, u32);
+   virtual bool canCompress(SInteger32Stats &stats)
+   {
+      return stats.max - stats.min <= std::numeric_limits<u16>::max();
+   }
+};
+// -------------------------------------------------------------------------------------
+class Truncation8 : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual IntegerSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static IntegerSchemeType staticSchemeType()
+   {
+      return IntegerSchemeType::TRUNCATION_8;
+   }
+   // -------------------------------------------------------------------------------------
+   INTEGER lookup(u32);
+   void scan(Predicate, BITMAP *, const u8 *, u32);
+   virtual bool canCompress(SInteger32Stats &stats)
+   {
+      return stats.max - stats.min <= std::numeric_limits<u8>::max();
+   }
+};
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+struct TruncationStructure {
+   INTEGER base;
+   CodeType truncated_values[];
+};
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+double ITruncExpectedCF(cengine::db::SInteger32Stats &stats)
+{
+   if ( stats.max - stats.min <= (std::numeric_limits<CodeType>::max())) {
+      return sizeof(INTEGER) / sizeof(CodeType);
+   } else {
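+      // The value range does not fit into CodeType, so truncation cannot compress this column.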
+      return 0;
+   }
+}
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+double ITruncCompress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats)
+{
+   die_if(stats.max - stats.min <= std::numeric_limits<CodeType>::max());
+   // -------------------------------------------------------------------------------------
+   auto &col_struct = *reinterpret_cast<TruncationStructure<CodeType> *>(dest);
+   // -------------------------------------------------------------------------------------
+   // Set the base
+   col_struct.base = stats.min;
+   // -------------------------------------------------------------------------------------
+   // Truncate each integer
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      if ( nullmap == nullptr || nullmap[row_i] ) {
+         auto biased_value = static_cast<CodeType>(src[row_i] - col_struct.base);
+         col_struct.truncated_values[row_i] = biased_value;
+      }
+   }
+   // -------------------------------------------------------------------------------------
+   return sizeof(TruncationStructure<CodeType>) + (sizeof(CodeType) * stats.tuple_count);
+}
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+void ITruncDecompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   /* As of now Truncation is unused and this part is commented out for simpler refactoring */
+   const auto &col_struct = *reinterpret_cast<const TruncationStructure<CodeType> *>(src);
+   UNREACHABLE()
+//   // -------------------------------------------------------------------------------------
+//   if (nullmap == nullptr || nullmap->type() == BitmapType::ALLONES) {
+//      for (u32 row_i = 0; row_i < tuple_count; row_i++) {
+//         dest[row_i] = col_struct.base + col_struct.truncated_values[row_i];
+//      }
+//   } else if(nullmap->type() == BitmapType::ALLZEROS) {
+//      return;
+//   } else {
+//      for (u32 row_i = 0; row_i < tuple_count; row_i++) {
+//         if (nullmap->test(row_i)) {
+//            dest[row_i] = col_struct.base + col_struct.truncated_values[row_i];
+//         }
+//      }
+//   }
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.cpp
new file mode 100644
index 0000000..b26c071
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.cpp
@@ -0,0 +1,39 @@
+#include "Units.hpp"
+#include "Uncompressed.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+double Uncompressed::expectedCompressionRatio(SInteger32Stats &, u8 allowed_cascading_level)
+{
+   return 1.0;
+}
+// -------------------------------------------------------------------------------------
+u32 Uncompressed::compress(const INTEGER *src, const BITMAP *, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   const u32 column_size = stats.total_size;
+   std::memcpy(dest, src, column_size);
+   return column_size;
+}
+// -------------------------------------------------------------------------------------
+void Uncompressed::decompress(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   const u32 column_size = tuple_count * sizeof(UINTEGER);
+   std::memcpy(dest, src, column_size);
+}
+// -------------------------------------------------------------------------------------
+INTEGER Uncompressed::lookup(u32) { UNREACHABLE(); }
+void Uncompressed::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.hpp
new file mode 100644
index 0000000..e9f1673
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/integer/Uncompressed.hpp
@@ -0,0 +1,33 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+class Uncompressed : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual IntegerSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static IntegerSchemeType staticSchemeType()
+   {
+      return IntegerSchemeType::UNCOMPRESSED;
+   }
+   // -------------------------------------------------------------------------------------
+   INTEGER lookup(u32);
+   void scan(Predicate, BITMAP *, const u8 *, u32);
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.cpp
new file mode 100644
index 0000000..bc94b9a
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.cpp
@@ -0,0 +1,71 @@
+#include "Units.hpp"
+#include "Dictionary.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+#include "datablock/schemes/v1/templated/VarDictionary.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace string {
+// -------------------------------------------------------------------------------------
+double Dictionary8::expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level)
+{
+   return VDictExpectedCompressionRatio<u8>(stats);
+}
+// -------------------------------------------------------------------------------------
+u32 Dictionary8::compress(const StringArrayViewer src, const BITMAP *nullmap, u8 *dest, StringStats &stats)
+{
+   return VDictCompressColumn<u8>(src, nullmap, dest, stats);
+}
+// -------------------------------------------------------------------------------------
+u32 Dictionary8::getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap)
+{
+   return VDictGetDecompressedSize<u8>(src, tuple_count);
+}
+// -------------------------------------------------------------------------------------
+void Dictionary8::decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   return VDictDecompressColumn<u8>(dest, nullmap, src, tuple_count, level);
+}
+
+u32 Dictionary8::getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) {
+   throw Generic_Exception("not implemented");
+}
+
+// -------------------------------------------------------------------------------------
+// 16-bit codes
+// -------------------------------------------------------------------------------------
+double Dictionary16::expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level)
+{
+   return VDictExpectedCompressionRatio<u16>(stats);
+}
+// -------------------------------------------------------------------------------------
+u32 Dictionary16::compress(const StringArrayViewer src, const BITMAP *nullmap, u8 *dest, StringStats &stats)
+{
+   return VDictCompressColumn<u16>(src, nullmap, dest, stats);
+}
+// -------------------------------------------------------------------------------------
+u32 Dictionary16::getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap)
+{
+   return VDictGetDecompressedSize<u16>(src, tuple_count);
+}
+// -------------------------------------------------------------------------------------
+void Dictionary16::decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   return VDictDecompressColumn<u16>(dest, nullmap, src, tuple_count, level);
+}
+
+u32 Dictionary16::getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) {
+   throw Generic_Exception("not implemented");
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.hpp
new file mode 100644
index 0000000..39f2211
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Dictionary.hpp
@@ -0,0 +1,50 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace string {
+class Dictionary8 : public StringScheme {
+public:
+   double expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const StringArrayViewer src, const BITMAP *nullmap, u8 *dest, StringStats &stats) override;
+   u32 getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   u32 getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   void decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual StringSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static StringSchemeType staticSchemeType()
+   {
+      return StringSchemeType::DICTIONARY_8;
+   }
+};
+// -------------------------------------------------------------------------------------
+class Dictionary16 : public StringScheme {
+public:
+   double expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const StringArrayViewer src, const BITMAP *bitmap, u8 *dest, StringStats &stats) override;
+   u32 getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   u32 getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   void decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual StringSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static StringSchemeType staticSchemeType()
+   {
+      return StringSchemeType::DICTIONARY_16;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.cpp
new file mode 100644
index 0000000..69a7779
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.cpp
@@ -0,0 +1,178 @@
+#include "Units.hpp"
+#include "OneValue.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+#include "storage/StringPointerArrayViewer.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "utils/Utils.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace string {
+// -------------------------------------------------------------------------------------
+double OneValue::expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level)
+{
+   if ( stats.distinct_values.size() <= 1 ) {
+      return stats.tuple_count;
+   } else {
+      return 0;
+   }
+}
+// -------------------------------------------------------------------------------------
+u32 OneValue::compress(const StringArrayViewer, const BITMAP *bitmap, u8 *dest, StringStats &stats)
+{
+   auto &col_struct = *reinterpret_cast<OneValueStructure *>(dest);
+   const auto one_value = *stats.distinct_values.begin();
+   col_struct.length = one_value.length();
+   std::memcpy(col_struct.data, one_value.data(), col_struct.length);
+   return col_struct.length + sizeof(OneValueStructure);
+}
+// -------------------------------------------------------------------------------------
+u32 OneValue::getDecompressedSizeNoCopy(const u8 *src, u32 tuple_count, BitmapWrapper *)
+{
+   auto &col_struct = *reinterpret_cast<const OneValueStructure *>(src);
+   u32 total_size = tuple_count * sizeof(StringPointerArrayViewer::View);
+   total_size += col_struct.length;
+   return total_size;
+}
+// -------------------------------------------------------------------------------------
+u32 OneValue::getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap)
+{
+   /*
+    * IMPORTANT: We use a custom decompressNoCopy. This is probably dead code for now, but could be reused for actually
+    * copying every individual string
+    */
+   auto &col_struct = *reinterpret_cast<const OneValueStructure *>(src);
+   u32 total_size = (tuple_count + 1) * sizeof(StringArrayViewer::Slot);
+   total_size += nullmap->cardinality() * col_struct.length;
+   return total_size;
+}
+// -------------------------------------------------------------------------------------
+void OneValue::decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   /*
+    * IMPORTANT: We use a custom decompressNoCopy. This is probably dead code for now, but could be reused for actually
+    * copying every individual string
+    */
+   auto &col_struct = *reinterpret_cast<const OneValueStructure *>(src);
+   auto dest_slots = reinterpret_cast<StringArrayViewer::Slot *>(dest);
+   auto dest_strings = dest + ((tuple_count + 1) * sizeof(StringArrayViewer::Slot));
+   // -------------------------------------------------------------------------------------
+   u32 write_offset = (tuple_count + 1) * sizeof(StringArrayViewer::Slot);
+
+   if (nullmap == nullptr || nullmap->type() == BitmapType::ALLONES) {
+      Utils::writeOffsetsU32(reinterpret_cast<u32 *>(dest), write_offset, col_struct.length, tuple_count);
+      Utils::multiplyString(reinterpret_cast<char *>(dest_strings), reinterpret_cast<const char *>(col_struct.data), col_struct.length, tuple_count, 1);
+      write_offset += tuple_count * col_struct.length;
+   } else if (nullmap->type() == BitmapType::ALLZEROS) {
+      // Everything is null. Content of the values does not matter
+      //Utils::multiplyU32(reinterpret_cast<u32 *>(dest_slots), &write_offset, tuple_count);
+      return;
+   } else {
+      /*
+       * TODO the code here needs more testing and investigation.
+       */
+      Roaring &r = nullmap->roaring();
+      if (nullmap->type() == BitmapType::REGULAR) {
+         std::tuple<StringArrayViewer::Slot *, u32, u32> param = {dest_slots, write_offset, col_struct.length};
+         r.iterate([](uint32_t value, void *param) {
+               auto p = reinterpret_cast<std::tuple<StringArrayViewer::Slot *, u32, u32> *>(param);
+               // TODO this actually looks wrong. Length calculation will probably not work afterwards
+               // Set offset of string at value to current offset
+               std::get<0>(*p)[value].offset = std::get<1>(*p);
+               // Advance offset by string length
+               std::get<1>(*p) += std::get<2>(*p);
+               // Set offset of next string to the advanced offset.
+               // In case the string is null this is necessary because the offset would otherwise not be set and
+               // calculating the length of the string at value would not yield a correct result.
+               std::get<0>(*p)[value+1].offset = std::get<1>(*p);
+               return true;
+            },
+            &param
+         );
+      } else { // FLIPPED
+         // The roaring map is inverted
+         // Every value the iterator returns is actually a null value.
+         // We therefore write offsets from the last index we know is not null to the index that holds a null value.
+         // The last index is then set to one after the null value, so we effectively skip it.
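+         // param holds (slots, current write offset, string length, last non-null index).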
+         std::tuple<StringArrayViewer::Slot *, u32, u32, u32> param = {dest_slots, write_offset, col_struct.length, 0};
+         r.iterate([](uint32_t value, void *param) {
+               auto p = reinterpret_cast<std::tuple<StringArrayViewer::Slot *, u32, u32, u32> *>(param);
+               // Calculate how many offsets we need to fill (value - last non-null index) + 1
+               // +1 for the null value so string length calculations don't break for non-null values
+               u32 n = value - std::get<3>(*p) + 1;
+               Utils::writeOffsetsU32(reinterpret_cast<u32 *>(std::get<0>(*p) + std::get<3>(*p)), std::get<1>(*p), std::get<2>(*p), n);
+
+               // We only wrote n-1 actual strings (because the last one was a null value)
+               n--;
+               // Advance write_offset by the number of strings written times the string length
+               std::get<1>(*p) += n * std::get<2>(*p);
+               // Adjust last non-null index.
+               std::get<3>(*p) = value + 1;
+               return true;
+            },
+            &param
+         );
+
+         // Write non-null offsets at the end.
+         // offset at tuple_count will be written at end of function
+         u32 n = tuple_count - std::get<3>(param);
+         Utils::writeOffsetsU32(reinterpret_cast<u32 *>(std::get<0>(param) + std::get<3>(param)), std::get<1>(param), std::get<2>(param), n);
+         write_offset = std::get<1>(param) + n * std::get<2>(param);
+      }
+
+      Utils::multiplyString(reinterpret_cast<char *>(dest_strings), reinterpret_cast<const char *>(col_struct.data), col_struct.length, nullmap->cardinality(), 1);
+   }
+
+   dest_slots[tuple_count].offset = write_offset;
+}
+
+bool OneValue::decompressNoCopy(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32) {
+   if (nullmap->type() == BitmapType::ALLZEROS) {
+      return true;
+   }
+
+   auto &col_struct = *reinterpret_cast<const OneValueStructure *>(src);
+
+   // We only have one string; write the same offset everywhere
+   auto dest_views = reinterpret_cast<StringPointerArrayViewer::View *>(dest);
+   StringPointerArrayViewer::View view = {
+      .length = col_struct.length,
+      .offset = static_cast<u32>(tuple_count * sizeof(StringPointerArrayViewer::View))
+   };
+
+#ifdef BTR_USE_SIMD
+   auto dest_view_simd = reinterpret_cast<__m256i *>(dest_views);
+   auto *data = reinterpret_cast<long long *>(&view);
+   __m256i data_v = _mm256_set1_epi64x(*data);
+   for (u32 idx = 0; idx < tuple_count; idx += 16) {
+      _mm256_storeu_si256(dest_view_simd, data_v);
+      _mm256_storeu_si256(dest_view_simd + 1, data_v);
+      _mm256_storeu_si256(dest_view_simd + 2, data_v);
+      _mm256_storeu_si256(dest_view_simd + 3, data_v);
+      dest_view_simd += 4;
+   }
+#else
+   std::memset(dest_views, *reinterpret_cast<long long *>(&view), tuple_count * sizeof(long long));
+#endif
+
+   auto dest_strings = reinterpret_cast<u8 *>(dest_views + tuple_count);
+   std::memcpy(dest_strings, col_struct.data, col_struct.length);
+   return true;
+}
+
+u32 OneValue::getTotalLength(const u8 *src, u32, BitmapWrapper *nullmap) {
+   auto &col_struct = *reinterpret_cast<const OneValueStructure *>(src);
+   return nullmap->cardinality() * col_struct.length;
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.hpp
new file mode 100644
index 0000000..91c4439
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/OneValue.hpp
@@ -0,0 +1,40 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
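+// OneValue stores the single distinct string once; decompression repeats it for every row.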
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace string {
+// -------------------------------------------------------------------------------------
+struct OneValueStructure {
+   u32 length;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+class OneValue : public StringScheme {
+public:
+   double expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const StringArrayViewer src, const BITMAP *bitmap, u8 *dest, StringStats &stats) override;
+   u32 getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   u32 getDecompressedSizeNoCopy(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   u32 getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   void decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   bool decompressNoCopy(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual StringSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static StringSchemeType staticSchemeType()
+   {
+      return StringSchemeType::ONE_VALUE;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.cpp
new file mode 100644
index 0000000..2e3716f
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.cpp
@@ -0,0 +1,49 @@
+#include "Units.hpp"
+#include "Uncompressed.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace string {
+// -------------------------------------------------------------------------------------
+double Uncompressed::expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level)
+{
+   return 1.0;
+   //return static_cast<double>(stats.total_length) / static_cast<double>(stats.total_length + sizeof(u32));// actually it is just 1 !
+}
+// -------------------------------------------------------------------------------------
+u32 Uncompressed::compress(const StringArrayViewer src, const BITMAP *, u8 *dest, StringStats &stats)
+{
+   auto &col_struct = *reinterpret_cast<UncompressedStructure *>(dest);
+   col_struct.total_size = stats.total_size;
+   std::memcpy(col_struct.data, src.slots_ptr, stats.total_size);
+   return stats.total_size + sizeof(UncompressedStructure);
+}
+// -------------------------------------------------------------------------------------
+u32 Uncompressed::getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap)
+{
+   return reinterpret_cast<const UncompressedStructure *>(src)->total_size;
+}
+// -------------------------------------------------------------------------------------
+void Uncompressed::decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   auto &col_struct = *reinterpret_cast<const UncompressedStructure *>(src);
+   std::memcpy(dest, col_struct.data, col_struct.total_size);
+}
+
+u32 Uncompressed::getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) {
+   auto &col_struct = *reinterpret_cast<const UncompressedStructure *>(src);
+   return col_struct.total_size - ((tuple_count + 1) * sizeof(StringArrayViewer::Slot));
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.hpp
new file mode 100644
index 0000000..cf6fe16
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/string/Uncompressed.hpp
@@ -0,0 +1,37 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+namespace string {
+// -------------------------------------------------------------------------------------
+struct UncompressedStructure {
+   u32 total_size;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+class Uncompressed : public StringScheme {
+public:
+   double expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const StringArrayViewer src, const BITMAP *bitmap, u8 *dest, StringStats &stats) override;
+   u32 getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   u32 getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   void decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline virtual StringSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static StringSchemeType staticSchemeType()
+   {
+      return StringSchemeType::UNCOMPRESSED;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/FixedDictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/FixedDictionary.hpp
new file mode 100644
index 0000000..a9d500c
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/FixedDictionary.hpp
@@ -0,0 +1,74 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+template <typename NumberType>
+struct FixedDictionaryStructure {
+   u32 codes_offset;
+   NumberType dict_slots[];
+};
+// -------------------------------------------------------------------------------------
+template <typename NumberType, typename CodeType, typename StatsType>
+inline double FDictExpectedCompressionRatio(StatsType &stats)
+{
+   if ( stats.unique_count > (std::numeric_limits<CodeType>::max() + 1)) {
+      return 0;
+   } else {
+      const u32 after_size = sizeof(FixedDictionaryStructure<NumberType>) + (stats.unique_count * sizeof(NumberType)) + (stats.tuple_count * sizeof(CodeType));
+      return static_cast<double>(stats.total_size) / static_cast<double>(after_size);
+   }
+}
+// -------------------------------------------------------------------------------------
+template <typename NumberType, typename CodeType, typename StatsType>
+inline u32 FDictCompressColumn(const NumberType *src, const BITMAP *, u8 *dest, StatsType &stats)
+{
+   die_if(stats.distinct_values.size() <= (std::numeric_limits<CodeType>::max() + 1));
+   // -------------------------------------------------------------------------------------
+   auto &col_struct = *reinterpret_cast<FixedDictionaryStructure<NumberType> *>(dest);
+   const u32 dict_size = stats.distinct_values.size() * sizeof(NumberType);
+   col_struct.codes_offset = sizeof(FixedDictionaryStructure<NumberType>) + dict_size;
+   // -------------------------------------------------------------------------------------
+   // Write dictionary
+   u32 distinct_i = 0;
+   for ( const auto &distinct_element : stats.distinct_values ) {
+      col_struct.dict_slots[distinct_i] = distinct_element.first;
+      distinct_i++;
+   }
+   // -------------------------------------------------------------------------------------
+   NumberType *dict_begin = col_struct.dict_slots;
+   NumberType *dict_end = dict_begin + stats.unique_count;
+   // -------------------------------------------------------------------------------------
+   auto codes = reinterpret_cast<CodeType *>(dest + col_struct.codes_offset);
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      auto it = std::lower_bound(dict_begin, dict_end, src[row_i]);
+      if ( it == dict_end ) {
+         die_if(stats.distinct_values.find(src[row_i]) != stats.distinct_values.end());
+      }
+      die_if(it != dict_end);
+      codes[row_i] = static_cast<CodeType>(std::distance(dict_begin, it));
+   }
+   // -------------------------------------------------------------------------------------
+   return reinterpret_cast<u8 *>(&codes[stats.tuple_count]) - dest;
+}
+// -------------------------------------------------------------------------------------
+template <typename NumberType, typename CodeType>
+inline void FDictDecompressColumn(NumberType *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   // -------------------------------------------------------------------------------------
+   const auto &col_struct = *reinterpret_cast<const FixedDictionaryStructure<NumberType> *>(src);
+   // -------------------------------------------------------------------------------------
+   // Get codes
+   const auto codes = reinterpret_cast<const CodeType *>(src + col_struct.codes_offset);
+   // -------------------------------------------------------------------------------------
+   for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) {
+      dest[row_i] = col_struct.dict_slots[codes[row_i]];
+   }
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/VarDictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/VarDictionary.hpp
new file mode 100644
index 0000000..e9a0299
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v1/templated/VarDictionary.hpp
@@ -0,0 +1,92 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v1 {
+// Note: Here the first code is reserved for 0 size (or null) strings
+// -------------------------------------------------------------------------------------
+struct VarDictionaryStructure {
+   u32 total_size;
+   u32 codes_offset;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+inline double VDictExpectedCompressionRatio(StringStats &stats)
+{
+   if ( stats.distinct_values.size() > (std::numeric_limits<CodeType>::max())) {
+      return 0;
+   } else {
+      u32 after_size = (stats.tuple_count * (sizeof(CodeType))) + (sizeof(StringArrayViewer::Slot) * (1 + stats.distinct_values.size())) + sizeof(VarDictionaryStructure);
+      after_size += stats.total_unique_length;
+      return CD(stats.total_size) / CD(after_size);
+   }
+}
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+inline u32 VDictCompressColumn(const StringArrayViewer src, const BITMAP *, u8 *dest, StringStats &stats)
+{
+   // Layout: STRINGS | OFFSETS | CODES
+   // -------------------------------------------------------------------------------------
+   die_if(stats.distinct_values.size() <= std::numeric_limits<CodeType>::max());
+   // -------------------------------------------------------------------------------------
+   auto &col_struct = *reinterpret_cast<VarDictionaryStructure *>(dest);
+   col_struct.total_size = stats.total_size;
+   vector<str> distinct_values(stats.distinct_values.begin(), stats.distinct_values.end());
+   // -------------------------------------------------------------------------------------
+   // set is assumed to be implemented as red-black tree with order preserving iterator
+   auto dest_slot_ptr = reinterpret_cast<StringArrayViewer::Slot *>(col_struct.data);
+   u8 *str_write_ptr = col_struct.data + ((distinct_values.size() + 1) * sizeof(StringArrayViewer::Slot));
+   for ( const auto &distinct_str: distinct_values ) {
+      dest_slot_ptr++->offset = str_write_ptr - col_struct.data; // Note, string offset is relative to the first slot
+      std::memcpy(str_write_ptr, distinct_str.data(), distinct_str.length());
+      str_write_ptr += distinct_str.length();
+   }
+   dest_slot_ptr->offset = str_write_ptr - col_struct.data;
+   col_struct.codes_offset = str_write_ptr - col_struct.data;
+   // -------------------------------------------------------------------------------------
+   auto codes_write_ptr = reinterpret_cast<CodeType *>(str_write_ptr);
+   // -------------------------------------------------------------------------------------
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      const str &current_value = src(row_i);
+      auto it = std::lower_bound(distinct_values.begin(), distinct_values.end(), current_value);
+      assert(it != distinct_values.end());
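+      // The position of the match among the sorted distinct values is the dictionary code.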
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+inline u32 VDictGetDecompressedSize(const u8 *src, u32)
+{
+   return reinterpret_cast<const VarDictionaryStructure *>(src)->total_size;
+}
+// -------------------------------------------------------------------------------------
+template <typename CodeType>
+inline void VDictDecompressColumn(u8 *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   const auto &col_struct = *reinterpret_cast<const VarDictionaryStructure *>(src);
+   StringArrayViewer dict_array(col_struct.data);
+   // -------------------------------------------------------------------------------------
+   // Prepare output
+   auto dest_slots = reinterpret_cast<StringArrayViewer::Slot *>(dest);
+   auto str_write_ptr = dest + sizeof(StringArrayViewer::Slot) * (tuple_count + 1);
+   // -------------------------------------------------------------------------------------
+   const auto codes = reinterpret_cast<const CodeType *>(col_struct.data + col_struct.codes_offset);
+   for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) {
+      auto current_code = codes[row_i];
+      auto decoded_str = dict_array(current_code);
+      dest_slots[row_i].offset = str_write_ptr - dest;
+      std::memcpy(str_write_ptr, decoded_str.data(), decoded_str.length());
+      str_write_ptr += decoded_str.length();
+   }
+   dest_slots[tuple_count].offset = str_write_ptr - dest;
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.cpp
new file mode 100644
index 0000000..bb7dca9
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.cpp
@@ -0,0 +1,149 @@
+#include "RoaringBitmap.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine::db::v2::bitmap {
+BitmapWrapper::BitmapWrapper(const u8 *src, BitmapType type, u32 tuple_count, boost::dynamic_bitset<> *bitset) : m_tuple_count(tuple_count), m_bitset(bitset), m_type(type) {
+   if (type == BitmapType::ALLONES) {
+      this->m_cardinality = tuple_count;
+      return;
+   } else if (type == BitmapType::ALLZEROS) {
+      this->m_cardinality = 0;
+      return;
+   }
+
+   this->m_roaring = Roaring::read(reinterpret_cast<const char *>(src), false);
+   this->m_cardinality = this->m_roaring.cardinality();
+   if (type == BitmapType::FLIPPED) {
+      this->m_cardinality = this->m_tuple_count - this->m_cardinality;
+   }
+}
+
+BitmapWrapper::~BitmapWrapper() {
+   delete this->m_bitset;
+}
+
+void BitmapWrapper::writeBITMAP(BITMAP *dest) {
+   switch (this->m_type) {
+      case BitmapType::ALLONES: {
+         for (u32 i = 0; i < this->m_tuple_count; i++) {
+            dest[i] = 1;
+         }
+         break;
+      }
+      case BitmapType::ALLZEROS: {
+         for (u32 i = 0; i < this->m_tuple_count; i++) {
+            dest[i] = 0;
+         }
+         break;
+      }
+      default: {
+         for (u32 i = 0; i < this->m_tuple_count; i++) {
+            dest[i] = this->get_bitset()->test(i) ? 1 : 0;
+         }
+         break;
+      }
+   }
+}
+
+void BitmapWrapper::releaseBitset() {
+   this->m_bitset = nullptr;
+   this->m_bitset_initialized = false;
+}
+
+boost::dynamic_bitset<> *BitmapWrapper::get_bitset() {
+   if (this->m_bitset_initialized) {
+      return this->m_bitset;
+   }
+
+   if (this->m_bitset == nullptr) {
+      this->m_bitset = new boost::dynamic_bitset<>(this->m_tuple_count);
+   }
+
+   switch (this->m_type) {
+      case BitmapType::ALLONES: {
+         this->m_bitset->set();
+         break;
+      }
+      case BitmapType::ALLZEROS: {
+         this->m_bitset->reset();
+         break;
+      }
+      case BitmapType::REGULAR: {
+         this->m_bitset->reset();
+         this->m_roaring.iterate([](uint32_t value, void *param) {
+               auto bitset = reinterpret_cast<boost::dynamic_bitset<> *>(param);
+               bitset->set(value, true);
+               return true;
+            },
+            this->m_bitset
+         );
+         break;
+      }
+      case BitmapType::FLIPPED: {
+         this->m_bitset->set();
+         this->m_roaring.iterate([](uint32_t value, void *param) {
+               auto bitset = reinterpret_cast<boost::dynamic_bitset<> *>(param);
+               bitset->set(value, false);
+               return true;
+            },
+            this->m_bitset
+         );
+         break;
+      }
+      default: {
+         throw Generic_Exception("Unknown BitmapType " + std::to_string(static_cast<int>(this->m_type)));
+      }
+   }
+
+   this->m_bitset_initialized = true;
+   return this->m_bitset;
+}
+
+
+// -------------------------------------------------------------------------------------
+std::pair<u32, BitmapType> RoaringBitmap::compress(const BITMAP *bitmap, u8 *dest, u32 tuple_count)
+{
+   // Returns a pair of compressed size and the type of bitmap used
+
+   // Determine the bitmap type by counting 1s
+   u32 ones_count = 0;
+   for (u32 i = 0; i < tuple_count; i++) {
+      ones_count += bitmap[i];
+   }
+
+   if (ones_count == 0) {
+      return {0, BitmapType::ALLZEROS};
+   } else if (ones_count == tuple_count) {
+      return {0, BitmapType::ALLONES};
+   }
+
+   BITMAP check_value;
+   BitmapType type;
+   if (ones_count < tuple_count/2) {
+      type = BitmapType::REGULAR;
+      check_value = 1;
+   } else {
+      // There are more 1s than 0s in the bitmap. In order to save space and computation time during decompression of
+      // the roaring bitmap we simply flip the bits. (Less 1s => Smaller Roaring bitmap)
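+      // e.g. tuple_count = 1000 with 950 ones: store only the 50 zero positions in the
+      // roaring bitmap and record FLIPPED; get_bitset() inverts them back on read.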
+      type = BitmapType::FLIPPED;
+      check_value = 0;
+   }
+
+   // Write the actual bitmap
+   Roaring r;
+   for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) {
+      if ( bitmap[row_i] == check_value) {
+         r.add(row_i);
+      }
+   }
+   r.runOptimize();
+   r.setCopyOnWrite(true);
+   u32 compressed_size = r.write(reinterpret_cast<char *>(dest), false);
+
+   return {compressed_size, type};
+}
+// -------------------------------------------------------------------------------------
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.hpp
new file mode 100644
index 0000000..0f3ef07
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/bitmap/RoaringBitmap.hpp
@@ -0,0 +1,42 @@
+#pragma once
+#include "Units.hpp"
+#include "roaring/roaring.hh"
+#include <boost/dynamic_bitset.hpp>
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine::db::v2::bitmap {
+class BitmapWrapper {
+private:
+   u32 m_tuple_count;
+   u32 m_cardinality;
+   Roaring m_roaring;
+   boost::dynamic_bitset<> *m_bitset = nullptr;
+   bool m_bitset_initialized = false;
+   BitmapType m_type;
+public:
+   BitmapWrapper(const u8 *src, BitmapType type, u32 tuple_count, boost::dynamic_bitset<> *bitset = nullptr);
+   virtual ~BitmapWrapper();
+   void writeBITMAP(BITMAP *dest);
+   boost::dynamic_bitset<> *get_bitset();
+   void releaseBitset();
+   [[nodiscard]] inline bool test(u32 idx) {
+      return this->get_bitset()->test(idx);
+   }
+   [[nodiscard]] inline u32 cardinality() const {
+      return this->m_cardinality;
+   };
+   [[nodiscard]] inline BitmapType type() const {
+      return this->m_type;
+   };
+   [[nodiscard]] inline Roaring &roaring() {
+      return this->m_roaring;
+   };
+};
+class RoaringBitmap {
+public:
+   static std::pair<u32, BitmapType> compress(const BITMAP *bitmap, u8 *dest, u32 tuple_count);
+};
+// -------------------------------------------------------------------------------------
+}
+using BitmapWrapper = cengine::db::v2::bitmap::BitmapWrapper;
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.cpp
new file mode 100644
index 0000000..65fc519
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.cpp
@@ -0,0 +1,460 @@
+#include "Units.hpp"
+#include "Decimal.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+// -------------------------------------------------------------------------------------
+#include <cmath>
+#include <vector>
+// -------------------------------------------------------------------------------------
+DEFINE_uint32(decimal_siginifcant_digit_bits_limit, 31, "");
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+
+const u32 max_exponent = 22;
+const u8 exponent_exception_code = 23;
+const u8 decimal_index_mask = 0x1F;
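+// 10^22 is the largest power of ten that is exactly representable as a double
+// (see the hex breakdown in the table comment below), hence max_exponent = 22
+// and the out-of-band code 23 for exceptions.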
+static const double exact_fractions_of_ten[] = {
+   1.0,
+   0.1,
+   0.01,
+   0.001,
+   0.0001,
+   0.00001,
+   0.000001,
+   0.0000001,
+   0.00000001,
+   0.000000001,
+   0.0000000001,
+   0.00000000001,
+   0.000000000001,
+   0.0000000000001,
+   0.00000000000001,
+   0.000000000000001,
+   0.0000000000000001,
+   0.00000000000000001,
+   0.000000000000000001,
+   0.0000000000000000001,
+   0.00000000000000000001,
+   0.000000000000000000001,
+   0.0000000000000000000001,
+};
+static_assert(sizeof(exact_fractions_of_ten) == sizeof(double) * 23);
+
+// Naive implementation
+/*
+ * I first tried your method in scalar code and it works. We can store doubles with 9
+ * significant digits (sometimes even 10) as a 32-bit unsigned integer plus 4 bits (or a
+ * byte) for the exponent (or borrow 4 bits of the uint32 for the exponent and only
+ * support up to 8 significant digits; I have not yet thought through exactly how to
+ * optimize this further), and the dataset apparently has columns whose doubles fall
+ * entirely within this range.
+ */
+// -------------------------------------------------------------------------------------
+u32 Decimal::compress(const DOUBLE *src, const BITMAP *, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level)
+{
+   // Layout : Header | numbers_v | exponent_v | patches_v
+   // ignore bitmap
+   auto &col_struct = *reinterpret_cast<DecimalStructure *>(dest);
+   col_struct.variant_selector = 0;
+   vector<INTEGER> numbers_v;
+   vector<INTEGER> exponent_v;
+   vector<DOUBLE> patches_v; // patches
+
+   u32 exception_count = 0;
+   u8 run_count = 0;
+
+   Roaring exceptions_bitmap;
+   const u32 num_blocks = (stats.tuple_count + (block_size-1)) / block_size;
+   for (u32 block_i = 0; block_i < num_blocks; block_i++) {
+      bool block_has_exception = false;
+
+      const u32 row_start_i = block_i * block_size;
+      const u32 row_end_i = std::min(row_start_i + block_size, stats.tuple_count);
+      for (u32 row_i = row_start_i; row_i < row_end_i; row_i++) {
+         DOUBLE current_double = src[row_i];
+
+         bool convertable = false;
+         u32 exponent;
+         u64 converted_number;
+         if ( current_double == -0.0 && std::signbit(current_double)) {
+            // Special case: -0.0 is handled as an exception
+            exponent = exponent_exception_code;
+         } else {
+            // Attempt conversion
+            for (exponent = 0; exponent <= max_exponent; exponent++) {
+               DOUBLE cd = current_double / exact_fractions_of_ten[exponent];
+               cd = std::round(cd);
+               converted_number = static_cast<u64>(cd);
+               DOUBLE if_converted_back = static_cast<DOUBLE>(converted_number) * exact_fractions_of_ten[exponent];
+               if (if_converted_back == current_double &&
+                   ((std::floor(std::log2(converted_number)) + 1) <= FLAGS_decimal_siginifcant_digit_bits_limit)) {
+                  convertable = true;
+                  break;
+               }
+            }
+         }
+
+         // Write result
+         if ( convertable ) {
+            die_if((std::floor(std::log2(converted_number)) + 1) <= 31);
+            exponent_v.push_back(static_cast<INTEGER>(exponent));
+            numbers_v.push_back(static_cast<INTEGER>(converted_number));
+         } else {
+            block_has_exception = true;
+            exception_count++;
+            // ALP_CHG
+            // START
+//          if (exception_count > stats.tuple_count/2) {
+//             // This is a hacky way to avoid using Decimal in columns where there are many exceptions
+//             // Returning a big number will make the selection process select uncompressed rather than Decimal
+//             return stats.total_size + 1000;
+//          }
+            // FINISH
+            exponent_v.push_back(exponent_exception_code);
+            patches_v.push_back(src[row_i]);
+         }
+      }
+
+      if (block_has_exception) {
+         run_count = 0;
+         exceptions_bitmap.add(block_i);
+      } else {
+         run_count++;
+         col_struct.variant_selector |= do_iteration;
+         if (run_count >= 4) {
+            col_struct.variant_selector |= do_unroll;
+         }
+      }
+   }
+
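+   // Worked example: for src[row_i] = 1.25, exponents 0 and 1 fail the round-trip
+   // check (1.0 and 1.2 come back), while exponent 2 gives round(1.25 / 0.01) = 125
+   // and 125 * 0.01 rounds back to exactly 1.25, so (125, 2) is emitted; -0.0 and
+   // values that never round-trip are written to patches_v under code 23.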
+   col_struct.converted_count = numbers_v.size();
+   auto write_ptr = col_struct.data;
+
+   // Compress significant digits
+   if ( !numbers_v.empty()) {
+      u32 used_space;
+      IntegerSchemePicker::compress(numbers_v.data(), nullptr, write_ptr, numbers_v.size(), allowed_cascading_level - 1, used_space, col_struct.numbers_scheme, AUTO_SCHEME, "significant digits");
+      write_ptr += used_space;
+      spdlog::debug("Decimal: sd_c = {} sd_s = {}", CI(col_struct.numbers_scheme), CI(used_space));
+   }
+
+   // Compress exponents
+   {
+      col_struct.exponents_offset = write_ptr - col_struct.data;
+      u32 used_space;
+      SInteger32Stats e_stats = SInteger32Stats::generateStats(exponent_v.data(), nullptr, exponent_v.size());
+      //cout << e_stats.min << '\t' << e_stats.max << endl;
+      IntegerSchemePicker::compress(exponent_v.data(), nullptr, write_ptr, exponent_v.size(), allowed_cascading_level - 1, used_space, col_struct.exponents_scheme, AUTO_SCHEME, "exponents");
+      write_ptr += used_space;
+      spdlog::debug("Decimal: e_c = {} e_s = {}", CI(col_struct.exponents_scheme), CI(used_space));
+   }
+
+   // Compress patches
+   {
+      col_struct.patches_offset = write_ptr - col_struct.data;
+      u32 used_space;
+      DoubleSchemePicker::compress(patches_v.data(), nullptr, write_ptr, patches_v.size(), allowed_cascading_level - 1, used_space, col_struct.patches_scheme, AUTO_SCHEME, "patches");
+      write_ptr += used_space;
+      spdlog::debug("Decimal: p_c = {} p_s = {}", CI(col_struct.patches_scheme), CI(used_space));
+   }
+
+   // Write exceptions bitmap
+   {
+      col_struct.exceptions_map_offset = write_ptr - col_struct.data;
+      exceptions_bitmap.runOptimize();
+      exceptions_bitmap.setCopyOnWrite(true);
+      write_ptr += exceptions_bitmap.write(reinterpret_cast<char *>(write_ptr), false);
+   }
+
+   return write_ptr - dest;
+}
+
+struct DecimalIterateParam {
+   u32 next_block_i;
+   u32 tuple_count;
+   DOUBLE *write_ptr;
+   INTEGER *exponents_ptr;
+   INTEGER *numbers_ptr;
+   DOUBLE *patches_ptr;
+};
+
+static inline void decompressExceptionBlock(DecimalIterateParam *param) {
+   u32 row_start_i = param->next_block_i * block_size;
+   u32 row_end_i = std::min(row_start_i + block_size, param->tuple_count);
+   for (u32 row_i = row_start_i; row_i < row_end_i; row_i++) {
+      INTEGER exponent = *param->exponents_ptr++;
+      if (exponent == exponent_exception_code ) {
+         *param->write_ptr++ = *param->patches_ptr++;
+      } else {
+         auto number = *param->numbers_ptr++;
+         u8 exponent_index = exponent & decimal_index_mask;
+         DOUBLE original_double = static_cast<DOUBLE>(number) * exact_fractions_of_ten[exponent_index];
+         *param->write_ptr++ = original_double;
+      }
+   }
+   param->next_block_i++;
+}
+
+#ifdef BTR_USE_SIMD
+static inline void decompressAVXBlock4(DecimalIterateParam *param) {
+   // Load numbers and convert to double
+   __m128i numbers_int_0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->numbers_ptr) + 0);
+   __m128i numbers_int_1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->numbers_ptr) + 1);
+   __m128i numbers_int_2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->numbers_ptr) + 2);
+   __m128i numbers_int_3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->numbers_ptr) + 3);
+
+   __m256d numbers_double_0 = _mm256_cvtepi32_pd(numbers_int_0);
+   __m256d numbers_double_1 = _mm256_cvtepi32_pd(numbers_int_1);
+   __m256d numbers_double_2 = _mm256_cvtepi32_pd(numbers_int_2);
+   __m256d numbers_double_3 = _mm256_cvtepi32_pd(numbers_int_3);
+
+   // Load exponents and gather the power of ten
+   __m128i exponents_0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->exponents_ptr) + 0);
+   __m128i exponents_1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->exponents_ptr) + 1);
+   __m128i exponents_2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->exponents_ptr) + 2);
+   __m128i exponents_3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->exponents_ptr) + 3);
+
+   // Gather powers
+   __m256d powers_0 = _mm256_i32gather_pd(exact_fractions_of_ten, exponents_0, 8);
+   __m256d powers_1 = _mm256_i32gather_pd(exact_fractions_of_ten, exponents_1, 8);
+   __m256d powers_2 = _mm256_i32gather_pd(exact_fractions_of_ten, exponents_2, 8);
+   __m256d powers_3 = _mm256_i32gather_pd(exact_fractions_of_ten, exponents_3, 8);
+
+   // Perform division
+   __m256d results_0 = _mm256_mul_pd(numbers_double_0, powers_0);
+   __m256d results_1 = _mm256_mul_pd(numbers_double_1, powers_1);
+   __m256d results_2 = _mm256_mul_pd(numbers_double_2, powers_2);
+   __m256d results_3 = _mm256_mul_pd(numbers_double_3, powers_3);
+
+   // Store result
+   _mm256_storeu_pd(param->write_ptr + 0, results_0);
+   _mm256_storeu_pd(param->write_ptr + 4, results_1);
+   _mm256_storeu_pd(param->write_ptr + 8, results_2);
+   _mm256_storeu_pd(param->write_ptr + 12, results_3);
+
+   param->write_ptr += 16;
+   param->exponents_ptr += 16;
+   param->numbers_ptr += 16;
+   param->next_block_i += 4;
+}
+
+static inline void decompressAVXBlock2(DecimalIterateParam *param) {
+   // Load numbers and convert to double
+   __m128i numbers_int_0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->numbers_ptr) + 0);
+   __m128i numbers_int_1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->numbers_ptr) + 1);
+
+   __m256d numbers_double_0 = _mm256_cvtepi32_pd(numbers_int_0);
+   __m256d numbers_double_1 = _mm256_cvtepi32_pd(numbers_int_1);
+
+   // Load exponents and gather the power of ten
+   __m128i exponents_0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->exponents_ptr) + 0);
+   __m128i exponents_1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->exponents_ptr) + 1);
+
+   // Gather powers
+   __m256d powers_0 = _mm256_i32gather_pd(exact_fractions_of_ten, exponents_0, 8);
+   __m256d powers_1 = _mm256_i32gather_pd(exact_fractions_of_ten, exponents_1, 8);
+
+   // Perform division
+   __m256d results_0 = _mm256_mul_pd(numbers_double_0, powers_0);
+   __m256d results_1 = _mm256_mul_pd(numbers_double_1, powers_1);
+
+   // Store result
+   _mm256_storeu_pd(param->write_ptr + 0, results_0);
+   _mm256_storeu_pd(param->write_ptr + 4, results_1);
+
+   param->write_ptr += 8;
+   param->exponents_ptr += 8;
+   param->numbers_ptr += 8;
+   param->next_block_i += 2;
+}
+
+static inline void decompressAVXBlock1(DecimalIterateParam *param) {
+   // Load numbers and convert to double
+   __m128i numbers_int = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->numbers_ptr));
+   __m256d numbers_double = _mm256_cvtepi32_pd(numbers_int);
+
+   // Load exponents and gather the power of ten
+   //__m128i exponents = _mm_loadu_si128(reinterpret_cast<__m128i *>(param->exponents_ptr));
+   // gather seems to be the bottleneck
+   // _mm256_exp10_pd (SVML: Only supported by Intel Compiler) (Sequential!)
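+   // Note: _mm256_set_pd takes its arguments highest-lane first, so below
+   // exponents_ptr[3] lands in lane 3 and exponents_ptr[0] in lane 0,
+   // matching the element order the gather would have produced.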
+   // _mm256_round_pd
+   //__m256d powers = _mm256_i32gather_pd(exact_fractions_of_ten, exponents, sizeof(*exact_fractions_of_ten));
+
+   // Use a simple set instead of load + gather
+   __m256d powers = _mm256_set_pd(exact_fractions_of_ten[param->exponents_ptr[3]],
+                                  exact_fractions_of_ten[param->exponents_ptr[2]],
+                                  exact_fractions_of_ten[param->exponents_ptr[1]],
+                                  exact_fractions_of_ten[param->exponents_ptr[0]]
+   );
+
+   // Perform division
+   __m256d results = _mm256_mul_pd(numbers_double, powers);
+
+   // Store result
+   _mm256_storeu_pd(param->write_ptr, results);
+
+   param->write_ptr += 4;
+   param->exponents_ptr += 4;
+   param->numbers_ptr += 4;
+   param->next_block_i++;
+}
+
+static inline void decompressAVXBlockUnroll(DecimalIterateParam *param, uint32_t limit) {
+#if 1
+   auto unroll_limit = limit < 3 ? 0 : limit-3;
+   while(param->next_block_i < unroll_limit) {
+      decompressAVXBlock4(param);
+   }
+#else
+   auto unroll_limit = limit < 1 ? 0 : limit-1;
+   while(param->next_block_i < unroll_limit) {
+      decompressAVXBlock2(param);
+   }
+#endif
+
+   while (param->next_block_i < limit) {
+      decompressAVXBlock1(param);
+   }
+
+   // Write block with exception
+   decompressExceptionBlock(param);
+}
+
+static inline void decompressAVXBlock(DecimalIterateParam *param, uint32_t limit) {
+   while (param->next_block_i < limit) {
+      decompressAVXBlock1(param);
+   }
+
+   // Write block with exception
+   decompressExceptionBlock(param);
+}
+#endif //BTR_USE_SIMD
+
+void Decimal::decompress(DOUBLE *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   // idea: save exceptions in roaring in blocks of 4.
+   // iterate over roaring. for everything up to the value use the vectorized implementation,
+   // for the value itself use the non-vectorized impl.
+   // don't forget the last block
+
+   const auto &col_struct = *reinterpret_cast<const DecimalStructure *>(src);
+   thread_local std::vector<std::vector<INTEGER>> numbers_v;
+   auto numbers_ptr = get_level_data(numbers_v, col_struct.converted_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+   thread_local std::vector<std::vector<INTEGER>> exponents_v;
+   auto exponents_ptr = get_level_data(exponents_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+   thread_local std::vector<std::vector<DOUBLE>> patches_v;
+   auto patches_ptr = get_level_data(patches_v, tuple_count - col_struct.converted_count + SIMD_EXTRA_ELEMENTS(DOUBLE), level);
+   Roaring exceptions_bitmap = Roaring::read(reinterpret_cast<const char *>(col_struct.data + col_struct.exceptions_map_offset), false);
+
+   if ( col_struct.converted_count > 0 ) {
+      IntegerScheme &numbers_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.numbers_scheme);
+      numbers_scheme.decompress(numbers_v[level].data(), nullptr, col_struct.data, col_struct.converted_count, level + 1);
+   }
+   IntegerScheme &exponents_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.exponents_scheme);
+   exponents_scheme.decompress(exponents_v[level].data(), nullptr, col_struct.data + col_struct.exponents_offset, tuple_count, level + 1);
+
+   DoubleScheme &patches_scheme = DoubleSchemePicker::MyTypeWrapper::getScheme(col_struct.patches_scheme);
+   patches_scheme.decompress(patches_v[level].data(), nullptr, col_struct.data + col_struct.patches_offset,
+                             tuple_count - col_struct.converted_count, level+1);
+
+#ifdef BTR_USE_SIMD
+   if (col_struct.variant_selector & do_iteration) {
+      struct DecimalIterateParam param = {
+         .next_block_i = 0,
+         .tuple_count = tuple_count,
+         .write_ptr = dest,
+         .exponents_ptr = exponents_ptr,
+         .numbers_ptr = numbers_ptr,
+         .patches_ptr = patches_ptr,
+      };
+      const u32 num_avx_blocks = tuple_count / block_size; // The number of blocks that are complete (have 4 values)
+
+      //if (col_struct.variant_selector & do_unroll) {
+      if (col_struct.variant_selector & do_unroll) {
+         exceptions_bitmap.iterate([](uint32_t value, void *param_void) {
+               auto param = reinterpret_cast<DecimalIterateParam *>(param_void);
+               decompressAVXBlockUnroll(param, value);
+               return true;
+            },
+            &param);
+
+         // Write remaining blocks
+         decompressAVXBlockUnroll(&param, num_avx_blocks);
+
+      } else {
+         exceptions_bitmap.iterate([](uint32_t value, void *param_void) {
+               auto param = reinterpret_cast<DecimalIterateParam *>(param_void);
+               decompressAVXBlock(param, value);
+               return true;
+            },
+            &param);
+
+         // Write remaining blocks
+         decompressAVXBlock(&param, num_avx_blocks);
+      }
+   } else {
+      auto write_ptr = dest;
+      for (u32 row_i = 0; row_i < tuple_count; row_i++) {
+         INTEGER exponent = *exponents_ptr++;
+         if (exponent == exponent_exception_code ) {
+            *write_ptr++ = *patches_ptr++;
+         } else {
+            auto number = *numbers_ptr++;
+            u8 exponent_index = exponent & decimal_index_mask;
+            DOUBLE original_double = static_cast<DOUBLE>(number) * exact_fractions_of_ten[exponent_index];
+            *write_ptr++ = original_double;
+         }
+      }
+   }
+#else // don't use SIMD
+   auto write_ptr = dest;
+   for (u32 row_i = 0; row_i < tuple_count; row_i++) {
+      INTEGER exponent = *exponents_ptr++;
+      if (exponent == exponent_exception_code) {
+         *write_ptr++ = *patches_ptr++;
+      } else {
+         auto number = *numbers_ptr++;
+         u8 exponent_index = exponent & decimal_index_mask;
+         DOUBLE original_double = static_cast<DOUBLE>(number) *
+                                  exact_fractions_of_ten[exponent_index];
+         *write_ptr++ = original_double;
+      }
+   }
+#endif
+}
+
+string Decimal::fullDescription(const u8 *src) {
+   const auto &col_struct = *reinterpret_cast<const DecimalStructure *>(src);
+   string result = this->selfDescription();
+
+   if ( col_struct.converted_count > 0 ) {
+      IntegerScheme &sd_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.numbers_scheme);
+      result += "\n\t-> ([int] significant digits) " + sd_scheme.fullDescription(col_struct.data);
+   }
+
+   IntegerScheme &e_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.exponents_scheme);
+   result += "\n\t-> ([int] exponents) " + e_scheme.fullDescription(col_struct.data + col_struct.exponents_offset);
+
+   DoubleScheme &p_scheme = DoubleSchemePicker::MyTypeWrapper::getScheme(col_struct.patches_scheme);
+   result += "\n\t-> ([double] patches) " + p_scheme.fullDescription(col_struct.data + col_struct.patches_offset);
+
+   return result;
+}
+
+bool Decimal::isUsable(DoubleStats &stats) {
+   double unique_ratio = static_cast<double>(stats.unique_count) / static_cast<double>(stats.tuple_count);
+   if (unique_ratio < 0.1) {
+      return false;
+   }
+   return true;
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.hpp
new file mode 100644
index 0000000..39327cb
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Decimal.hpp
@@ -0,0 +1,58 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+#include "roaring/roaring.hh"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
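+// On-disk layout of this scheme (see Decimal.cpp): a DecimalStructure header,
+// followed at the recorded offsets by the cascade-compressed significant digits,
+// exponents, patches, and the per-block exception roaring bitmap.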
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+const u32 block_size = 4; // Block size adjusted for AVX2
+const u8 do_iteration = (1<<0);
+const u8 do_unroll = (1<<1);
+// -------------------------------------------------------------------------------------
+struct DecimalStructure {
+   struct __attribute__((packed)) Slot {
+      u8 d1 : 4;
+      u8 d2 : 4;
+   };
+   static_assert(sizeof(Slot) == 1, "");
+   // -------------------------------------------------------------------------------------
+   u32 converted_count;
+   // -------------------------------------------------------------------------------------
+   u8 numbers_scheme;
+   u8 exponents_scheme;
+   u8 patches_scheme;
+   u8 variant_selector;
+   // -------------------------------------------------------------------------------------
+   u32 exponents_offset;
+   u32 patches_offset;
+   u32 exceptions_map_offset;
+   // -------------------------------------------------------------------------------------
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+class Decimal : public DoubleScheme {
+public:
+   u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override;
+   void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override;
+   bool isUsable(DoubleStats &stats) override;
+   std::string fullDescription(const u8 *src) override;
+   inline virtual DoubleSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::X_DECIMAL;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.cpp
new file mode 100644
index 0000000..0e6fa14
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.cpp
@@ -0,0 +1,60 @@
+#include "Exceptions.hpp"
+#include "Units.hpp"
+#include "DoubleBP.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "storage/Chunk.hpp"
+#include "utils/Utils.hpp"
+// -------------------------------------------------------------------------------------
+#include "extern/FastPFOR.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+#include <cmath>
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+u32 DoubleBP::compress(const DOUBLE *src, const BITMAP *, u8 *dest, DoubleStats& stats, [[maybe_unused]] u8 allowed_cascading_level)
+{
+   auto &col_struct = *reinterpret_cast<DoubleBPStructure *>(dest);
+   // -------------------------------------------------------------------------------------
+   FBPImpl codec;
+   // -------------------------------------------------------------------------------------
+   auto dest_integer = reinterpret_cast<u64>(col_struct.data);
+   u64 padding = dest_integer;
+   dest_integer = (dest_integer + 3) & ~3ul;
+   col_struct.padding = dest_integer - padding;
+   auto dest_4_aligned = reinterpret_cast<u32 *>(dest_integer);
+   // -------------------------------------------------------------------------------------
+   size_t compressed_codes_size = 4*stats.tuple_count + 1024; // give FBP a large enough output buffer
+   // 2x tuple count because we're actually compressing doubles
+   codec.compress(reinterpret_cast<const u32 *>(src), stats.tuple_count*2, dest_4_aligned, compressed_codes_size);
+   col_struct.u32_count = compressed_codes_size;
+   // -------------------------------------------------------------------------------------
+   return sizeof(DoubleBPStructure) + compressed_codes_size * sizeof(u32);
+}
+// -------------------------------------------------------------------------------------
+void DoubleBP::decompress(DOUBLE *dest, [[maybe_unused]] BitmapWrapper *, const u8 *src, u32 tuple_count, [[maybe_unused]] u32 level)
+{
+   auto &col_struct = *reinterpret_cast<const DoubleBPStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   FBPImpl codec;
+   SIZE decompressed_codes_size;
+   auto encoded_array = const_cast<u32 *>(reinterpret_cast<const u32 *>(col_struct.data + col_struct.padding));
+   const u32* dst_ptr = codec.decompress(encoded_array, col_struct.u32_count, reinterpret_cast<u32 *>(dest), decompressed_codes_size);
+   if ( dst_ptr != encoded_array + col_struct.u32_count ) {
+      throw Generic_Exception("Decompressing DoubleBP failed");
+   }
+}
+// -------------------------------------------------------------------------------------
+DOUBLE DoubleBP::lookup(u32) { UNREACHABLE(); };
+// -------------------------------------------------------------------------------------
+void DoubleBP::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); };
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.hpp
new file mode 100644
index 0000000..1df4ad5
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DoubleBP.hpp
@@ -0,0 +1,37 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+struct DoubleBPStructure {
+   u32 u32_count; // number of 4 byte words written by FastBP
+   u8 padding;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+class DoubleBP : public DoubleScheme {
+public:
+   u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override;
+   void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override;
+
+   inline virtual DoubleSchemeType schemeType() override
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::DOUBLE_BP;
+   }
+   DOUBLE lookup(u32);
+   void scan(Predicate, BITMAP *, const u8 *, u32);
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
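DoubleBP's trick is pure type punning: a column of IEEE-754 doubles is handed to an
integer bit-packing codec as twice as many u32 words, which is why the compress call
above passes tuple_count*2. A minimal standalone sketch of that reinterpretation round
trip, with memcpy standing in for the FastPFOR codec (an assumption, not this patch's
codec API):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Each double contributes exactly two u32 words (native endianness).
    std::vector<uint32_t> doubles_as_u32(const std::vector<double> &src) {
       std::vector<uint32_t> words(src.size() * 2);
       std::memcpy(words.data(), src.data(), src.size() * sizeof(double));
       return words;
    }

    // Inverse: reassemble the doubles from the word stream.
    std::vector<double> u32_as_doubles(const std::vector<uint32_t> &words) {
       std::vector<double> out(words.size() / 2);
       std::memcpy(out.data(), words.data(), out.size() * sizeof(double));
       return out;
    }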
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.cpp
new file mode 100644
index 0000000..7dab3a5
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.cpp
@@ -0,0 +1,44 @@
+#include "Units.hpp"
+#include "DynamicDictionary.hpp"
+#include "datablock/schemes/v2/templated/DynamicDictionary.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+// -------------------------------------------------------------------------------------
+#include <cmath>
+#include <vector>
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+using MyDynamicDictionary = TDynamicDictionary<DOUBLE>;
+// -------------------------------------------------------------------------------------
+double DynamicDictionary::expectedCompressionRatio(cengine::db::DoubleStats &stats, u8 allowed_cascading_level)
+{
+   return MyDynamicDictionary::expectedCompressionRatio(stats, allowed_cascading_level);
+}
+// -------------------------------------------------------------------------------------
+u32 DynamicDictionary::compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level)
+{
+   return MyDynamicDictionary::compressColumn(src, nullmap, dest, stats, allowed_cascading_level);
+}
+// -------------------------------------------------------------------------------------
+void DynamicDictionary::decompress(DOUBLE *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   return MyDynamicDictionary::decompressColumn(dest, nullmap, src, tuple_count, level);
+}
+
+string DynamicDictionary::fullDescription(const u8 *src) {
+   return MyDynamicDictionary::fullDescription(src, this->selfDescription());
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.hpp
new file mode 100644
index 0000000..40946bd
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/DynamicDictionary.hpp
@@ -0,0 +1,31 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+class DynamicDictionary : public DoubleScheme {
+public:
+   virtual double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level);
+   u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override;
+   void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override;
+   std::string fullDescription(const u8 *src) override;
+   inline virtual DoubleSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::X_DICT;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.cpp
new file mode 100644
index 0000000..cc7b81f
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.cpp
@@ -0,0 +1,49 @@
+#include "Units.hpp"
+#include "Frequency.hpp"
+#include "datablock/schemes/v2/templated/Frequency.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+// -------------------------------------------------------------------------------------
+#include <cmath>
+#include <vector>
+// -------------------------------------------------------------------------------------
+DEFINE_uint32(frequency_threshold_pct, 50, "");
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+using MyFrequency = TFrequency<DOUBLE>;
+// -------------------------------------------------------------------------------------
+double Frequency::expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level)
+{
+   if ( CD(stats.unique_count) * 100.0 / CD(stats.tuple_count) > FLAGS_frequency_threshold_pct ) {
+      return 0;
+   }
+   return DoubleScheme::expectedCompressionRatio(stats, allowed_cascading_level);
+}
+// -------------------------------------------------------------------------------------
+u32 Frequency::compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level)
+{
+   return MyFrequency::compressColumn(src, nullmap, dest, stats, allowed_cascading_level);
+}
+// -------------------------------------------------------------------------------------
+void Frequency::decompress(DOUBLE *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   return MyFrequency::decompressColumn(dest, nullmap, src, tuple_count, level);
+}
+
+string Frequency::fullDescription(const u8 *src) {
+   return MyFrequency::fullDescription(src, this->selfDescription());
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.hpp
new file mode 100644
index 0000000..51c1744
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Frequency.hpp
@@ -0,0 +1,31 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
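+// Frequency targets columns dominated by a few hot values: the templated
+// implementation (v2/templated/Frequency.hpp) stores the dominant value(s) once and
+// patches the rare rows as exceptions. The scheme is only attempted when the unique
+// ratio is at most frequency_threshold_pct (default 50%), per the check above.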
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+class Frequency : public DoubleScheme {
+public:
+   double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override;
+   void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override;
+   std::string fullDescription(const u8 *src) override;
+   inline virtual DoubleSchemeType schemeType() override
+   {
+      return staticSchemeType();
+   }
+   inline static DoubleSchemeType staticSchemeType()
+   {
+      return DoubleSchemeType::X_FREQUENCY;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/GDHacky.xpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/GDHacky.xpp
new file mode 100644
index 0000000..c22b52a
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/GDHacky.xpp
@@ -0,0 +1,135 @@
+#include "Units.hpp"
+#include "Reinterpret.hpp"
+#include "Hacky.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "storage/Chunk.hpp"
+#include "bitpack.h"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+#include "double-conversion/fast-dtoa.h"
+#include "roaring/roaring.hh"
+// -------------------------------------------------------------------------------------
+#include <cmath>
+#include <string>
+#include <vector>
+#include <limits>
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+static const double exact_powers_of_ten[] = {
+   1.0, // 10^0
+   10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0, 1000000000.0, 10000000000.0, // 10^10
+   100000000000.0, 1000000000000.0, 10000000000000.0, 100000000000000.0, 1000000000000000.0, 10000000000000000.0, 100000000000000000.0, 1000000000000000000.0, 10000000000000000000.0, 100000000000000000000.0, // 10^20
+   1000000000000000000000.0,
+   // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22
+   10000000000000000000000.0
+};
+// -------------------------------------------------------------------------------------
+double Hacky::expectedCompressionRatio(DoubleStats &stats)
+{
+   return 1.1;
+}
+// -------------------------------------------------------------------------------------
+u32 Hacky::compress(const DOUBLE *src, const BITMAP *, u8 *dest, DoubleStats &stats, u8)
+{
+   // -------------------------------------------------------------------------------------
+   auto &col_struct = *reinterpret_cast<HackyStructure *>(dest);
+   auto write_ptr = col_struct.data;
+   // -------------------------------------------------------------------------------------
+   // Init gdouble
+   char buffer_container[1024];
+   double_conversion::Vector<char> buffer(buffer_container, 1024);
+   int length = 0;
+   int point = 0;
+   // -------------------------------------------------------------------------------------
+   vector<u64> printed_integer;
+   vector<INTEGER> printed_point;
+   vector<u8> flags;
+   vector<DOUBLE> exceptions;
+   u32 failure_counter = 0;
+   const u8 flag_neg = 0, flag_pos = 1, flag_fail = 2;
+   // -------------------------------------------------------------------------------------
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      point = 0;
+      auto current_double = src[row_i];
+      if ( current_double == 0 ) {
+         flags.push_back(3);
+      } else {
+         if(current_double < 0)
+            current_double = std::copysign(current_double, +1.0);
+         int status = FastDtoa(current_double, double_conversion::FAST_DTOA_SHORTEST, 0,
+                               buffer, &length, &point);
+         if ( status == 0 ) {
+            exceptions.push_back(current_double);
+            flags.push_back(flag_fail);
+            failure_counter++;
+            continue;
+         }
+         failure_counter += !status;
+         printed_integer.push_back(std::stoull(buffer.start()));
+         printed_point.push_back(point);
+//       {
+//          u64 sd = std::stoull(buffer.start());
+//          u32 length = CU(std::floor(std::log10(sd)) + 1);
+//          s32 exponent = point - length;
+//          double base_double = CD(sd);
+//          if ( exponent > 0 ) {
+//             check(status == 0 || current_double == base_double * exact_powers_of_ten[exponent]);
+//          } else {
+//             check(status == 0 || current_double == base_double / exact_powers_of_ten[-exponent]);
+//          }
+//       }
+      }
+   }
+   IntegerSchemePicker::compress(printed_point.data(), nullptr, write_ptr, printed_point.size(), 2, col_struct.integers_offset, col_struct.points_scheme_code);
+   write_ptr += col_struct.integers_offset;
+   // -------------------------------------------------------------------------------------
+   write_ptr += bitnpack64(printed_integer.data(), printed_integer.size(), write_ptr);
+   // -------------------------------------------------------------------------------------
+   // hack the numbers
+   write_ptr += exceptions.size() * 8;
+   write_ptr += static_cast<u32>(stats.tuple_count * 2.0 / 8.0);
+   return write_ptr - dest;
+}
+// -------------------------------------------------------------------------------------
+void Hacky::decompress(DOUBLE *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   auto &col_struct = *reinterpret_cast<const HackyStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   thread_local std::vector<std::vector<INTEGER>> printed_point_v;
+   auto printed_point = get_data(printed_point_v, tuple_count, level);
+   IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.points_scheme_code);
+   scheme.decompress(printed_point, nullptr, col_struct.data, tuple_count, level+1);
+   // -------------------------------------------------------------------------------------
+   thread_local std::vector<std::vector<u64>> printed_integer_v;
+   auto printed_integer = get_data(printed_integer_v, tuple_count, level);
+   bitnunpack64(const_cast<u8 *>(col_struct.data + col_struct.integers_offset), tuple_count, printed_integer);
+   // -------------------------------------------------------------------------------------
+   for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) {
+      auto sd = printed_integer[row_i];
+      u32 point = printed_point[row_i];
+      if ( sd == 0 && point == 0 ) {
+         dest[row_i] = 0.0;
+      } else {
+         u32 length = CU(std::floor(std::log10(printed_integer[row_i])) + 1);
+         s32 exponent = point - length;
+         double base_double = CD(sd);
+         if ( exponent > 0 ) {
+            dest[row_i] = base_double * exact_powers_of_ten[exponent];
+         } else {
+            dest[row_i] = base_double / exact_powers_of_ten[-exponent];
+         }
+      }
+   }
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.cpp
new file mode 100644
index 0000000..f9fba99
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.cpp
@@ -0,0 +1,204 @@
+#include "Units.hpp"
+#include "Reinterpret.hpp"
+#include "Hacky.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "datablock/schemes/v2/integer/PBP.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+#include "double-conversion/fast-dtoa.h"
+#include "roaring/roaring.hh"
+// -------------------------------------------------------------------------------------
+#include <cmath>
+#include <string>
+#include <vector>
+#include <limits>
+// -------------------------------------------------------------------------------------
+DEFINE_uint32(hacky, 120, "");
+DEFINE_uint32(hacky_min_exponent, 10, "");
+DEFINE_uint32(hacky_min_occurrence_count, 1000, "");
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+// -------------------------------------------------------------------------------------
+double Hacky::expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level)
+{
+   if ( stats.tuple_count < 65000 || stats.unique_count * 3 >= stats.total_size )
+      return 0;
+   return DoubleScheme::expectedCompressionRatio(stats, allowed_cascading_level);
+}
+// -------------------------------------------------------------------------------------
+u32 Hacky::compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level)
+{
+   // -------------------------------------------------------------------------------------
+   auto &col_struct = *reinterpret_cast<HackyStructure *>(dest);
+   auto write_ptr = col_struct.data;
+   // -------------------------------------------------------------------------------------
+   vector<u64> mantissa_v;
+   vector<INTEGER> exponent_v;
+   vector<INTEGER> sign_exponent_v;
+   // -------------------------------------------------------------------------------------
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      u64 sign_exponent = RU64(src[row_i]) & 0xFFF0000000000000;
+      sign_exponent = sign_exponent >> 52;
+      sign_exponent_v.push_back(static_cast<INTEGER>(sign_exponent));
+      // -------------------------------------------------------------------------------------
+      exponent_v.push_back(sign_exponent & 0x7FFF);
+      // -------------------------------------------------------------------------------------
+      u64 mantissa = RU64(src[row_i]) & 0x000FFFFFFFFFFFFF;
+      mantissa_v.push_back(mantissa);
+   }
+   // -------------------------------------------------------------------------------------
+   die_if(mantissa_v.size() == sign_exponent_v.size());
+   // -------------------------------------------------------------------------------------
+   {
+      col_struct.sign_exponent_offset = write_ptr - col_struct.data;
+      u32 used_space;
+      IntegerSchemePicker::compress(sign_exponent_v.data(), nullmap, write_ptr, sign_exponent_v.size(), allowed_cascading_level - 1, used_space, col_struct.sign_exponent_code, AUTO_SCHEME, "sign_exponents");
+      write_ptr += used_space;
+   }
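+   // IEEE-754 layout recap: bit 63 = sign, bits 62..52 = biased exponent,
+   // bits 51..0 = mantissa; the 0xFFF0000000000000 mask shifted right by 52
+   // therefore yields the 12-bit sign+exponent field compressed above.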
+   // -------------------------------------------------------------------------------------
+   NumberStats<INTEGER> exponent_stats = NumberStats<INTEGER>::generateStats(exponent_v.data(), nullmap, exponent_v.size());
+   NumberStats<INTEGER> sign_exponent_stats = NumberStats<INTEGER>::generateStats(sign_exponent_v.data(), nullmap, sign_exponent_v.size());
+// NumberStats<u64> mantissa_stats = NumberStats<u64>::generateStats(mantissa_v.data(), nullptr, mantissa_v.size());
+   // -------------------------------------------------------------------------------------
+   col_struct.common_exponent = exponent_stats.distinct_values.begin()->first;
+   u32 occurence_count = exponent_stats.distinct_values.begin()->second;
+   for ( const auto &t : exponent_stats.distinct_values ) {
+      if ( t.second > occurence_count ) {
+         occurence_count = t.second;
+         col_struct.common_exponent = t.first;
+      }
+   }
+   col_struct.common_exponent -= 1023;
+   // -------------------------------------------------------------------------------------
+   if ( stats.min >= 0 && col_struct.common_exponent > 0 ) {
+      col_struct.three_way_split = true;
+      col_struct.common_exponent = std::min(col_struct.common_exponent, s16(31));
+      // -------------------------------------------------------------------------------------
+      vector<INTEGER> opt_mantissa_top_v;
+      vector<u64> opt_mantissa_bot_v;
+      // -------------------------------------------------------------------------------------
+      for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+         u64 opt_mantissa = RU64(src[row_i]) & 0x000FFFFFFFFFFFFF;
+         // -------------------------------------------------------------------------------------
+         u64 opt_mantissa_top = opt_mantissa >> (52 - col_struct.common_exponent);
+         opt_mantissa_top_v.push_back(static_cast<INTEGER>(opt_mantissa_top)); // top 23 bits
+         // -------------------------------------------------------------------------------------
+         u64 opt_mantissa_bot = opt_mantissa << (12 + col_struct.common_exponent);
+         opt_mantissa_bot = opt_mantissa_bot >> (12 + col_struct.common_exponent);
+         opt_mantissa_bot_v.push_back(opt_mantissa_bot);
+      }
+      NumberStats<INTEGER> opt_mantissa_top_stats = NumberStats<INTEGER>::generateStats(opt_mantissa_top_v.data(), nullmap, opt_mantissa_top_v.size());
+      NumberStats<u64> opt_mantissa_bot_stats = NumberStats<u64>::generateStats(opt_mantissa_bot_v.data(), nullmap, opt_mantissa_bot_v.size());
+      // -------------------------------------------------------------------------------------
+      {
+         col_struct.mantissa_top_offset = write_ptr - col_struct.data;
+         u32 used_space;
+         IntegerSchemePicker::compress(opt_mantissa_top_v.data(), nullmap, write_ptr, opt_mantissa_top_v.size(), allowed_cascading_level - 1, used_space, col_struct.mantissa_top_code, AUTO_SCHEME, "mantissa_top");
+         write_ptr += used_space;
+      }
+      // -------------------------------------------------------------------------------------
+      {
+         col_struct.mantissa_bottom_offset = write_ptr - col_struct.data;
+         u32 used_space = integer::FBP64::compress(opt_mantissa_bot_v.data(), write_ptr, opt_mantissa_bot_v.size());
+         write_ptr += used_space;
+      }
+      // -------------------------------------------------------------------------------------
+      return write_ptr - dest;
+   } else {
+      col_struct.three_way_split = false;
+      col_struct.mantissa_top_offset = write_ptr - col_struct.data;
+      // -------------------------------------------------------------------------------------
+      u32 used_space = integer::FBP64::compress(mantissa_v.data(), write_ptr, mantissa_v.size());
+      write_ptr += used_space;
+      return write_ptr - dest;
+   }
+}
+// -------------------------------------------------------------------------------------
+void Hacky::decompress(DOUBLE *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   // -------------------------------------------------------------------------------------
+   auto &col_struct = *reinterpret_cast<const HackyStructure *>(src);
+   thread_local std::vector<std::vector<INTEGER>> sign_exponent_v;
+   auto sign_exponent = get_level_data(sign_exponent_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+   {
+      IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.sign_exponent_code);
+      scheme.decompress(sign_exponent, nullptr, col_struct.data + col_struct.sign_exponent_offset,
+                        tuple_count,
+                        level+1);
+   }
+   auto write_ptr = reinterpret_cast<u64 *>(dest);
+   if ( col_struct.three_way_split ) {
+      // -------------------------------------------------------------------------------------
+      thread_local std::vector<std::vector<INTEGER>> mantissa_top_v;
+      auto mantissa_top = get_level_data(mantissa_top_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+      {
+         IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.mantissa_top_code);
+         scheme.decompress(mantissa_top, nullptr, col_struct.data + col_struct.mantissa_top_offset,
+                           tuple_count, level+1);
+      }
+      // -------------------------------------------------------------------------------------
+      thread_local std::vector<std::vector<u64>> mantissa_bot_v;
+      auto mantissa_bot = get_level_data(mantissa_bot_v, tuple_count * 2, level);
+      {
+         integer::FBP64::decompress(reinterpret_cast<u8 *>(mantissa_bot), col_struct.data + col_struct.mantissa_bottom_offset, tuple_count, level+1);
+      }
+      // -------------------------------------------------------------------------------------
+      for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) {
+         write_ptr[row_i] = sign_exponent[row_i];
+         write_ptr[row_i] = write_ptr[row_i] << 52;
+         // -------------------------------------------------------------------------------------
+         u64 tmp = mantissa_top[row_i];
+         tmp = tmp << (52 - col_struct.common_exponent);
+         write_ptr[row_i] = write_ptr[row_i] | tmp;
+         // -------------------------------------------------------------------------------------
+//       write_ptr[row_i] = write_ptr[row_i] >> (52 - col_struct.common_exponent);
+//       write_ptr[row_i] = write_ptr[row_i] << (52 - col_struct.common_exponent);
+         write_ptr[row_i] = write_ptr[row_i] | mantissa_bot[row_i];
+      }
+   } else {
+      thread_local std::vector<std::vector<u64>> mantissa_v;
+      auto mantissa = get_level_data(mantissa_v, tuple_count + 100, level);
+      // -------------------------------------------------------------------------------------
+      integer::FBP64::decompress(reinterpret_cast<u8 *>(mantissa), col_struct.data + col_struct.mantissa_top_offset, tuple_count, level+1);
+      for ( u32 row_i = 0; row_i < tuple_count; row_i++ ) {
+         write_ptr[row_i] = sign_exponent[row_i];
+         write_ptr[row_i] = write_ptr[row_i] << 52;
+         // -------------------------------------------------------------------------------------
+         write_ptr[row_i] = write_ptr[row_i] | mantissa[row_i];
+      }
+   }
+
+}
+
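+// Worked example for three_way_split with common_exponent = 3: each row is rebuilt
+// as (sign_exponent << 52) | (mantissa_top << (52 - 3)) | mantissa_bot, i.e. the
+// 12 header bits, the top 3 mantissa bits, and the remaining 49 mantissa bits.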
IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.mantissa_top_code); + result += "\n\t-> ([int] mantissa top) " + scheme.fullDescription(col_struct.data + col_struct.mantissa_top_offset); + } + result += "\n\t-> ([int] mantissa bottom) FBP64"; + } else { + result += "\n\t-> ([int] mantissa) FBP64"; + } + + return result; +} +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.hpp new file mode 100644 index 0000000..5720088 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/Hacky.hpp @@ -0,0 +1,48 @@ +#pragma once +#include "datablock/schemes/CScheme.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace d { +// ------------------------------------------------------------------------------------- +struct HackyStructure { + bool three_way_split; + // ------------------------------------------------------------------------------------- + s16 common_exponent; + // ------------------------------------------------------------------------------------- + u8 sign_exponent_code; + u32 sign_exponent_offset; + // ------------------------------------------------------------------------------------- + u32 mantissa_top_offset; + u8 mantissa_top_code; + // ------------------------------------------------------------------------------------- + u32 mantissa_bottom_offset; + u8 mantissa_bottom_code; + // ------------------------------------------------------------------------------------- + u8 data[]; +}; +// ------------------------------------------------------------------------------------- +class Hacky : public DoubleScheme { +public: + double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) override; + u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override; + void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override; + std::string fullDescription(const u8 *src) override; + inline virtual DoubleSchemeType schemeType() + { + return staticSchemeType(); + } + inline static DoubleSchemeType staticSchemeType() + { + return DoubleSchemeType::X_HACKY; + } +}; +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.cpp new file mode 100644 index 0000000..03d48c1 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.cpp @@ -0,0 +1,174 @@ +#include "Units.hpp" +#include "MaxExponent.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +#include "storage/Chunk.hpp" +// 
-------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+#include "roaring/roaring.hh"
+// -------------------------------------------------------------------------------------
+#include <cmath>
+#include <iostream>
+#include <iomanip>
+#include <bitset>
+// -------------------------------------------------------------------------------------
+DEFINE_uint32(MaxExponent_siginifcant_digit_bits_limit, 64, "");
+// -------------------------------------------------------------------------------------
+using namespace std;
+void printDouble(double input)
+{
+   union {
+      double d;
+      uint64_t u;
+   };
+
+   d = input;
+   bool sign = (u >> 63) & 0x1;
+   uint64_t exponent = (u >> 52) & 0x7FF;
+   uint64_t mantissa = u & 0xFFFFFFFFFFFFF;
+
+   cout << sign << " " << bitset<11>(exponent) << " " << bitset<52>(mantissa) << " " << std::setprecision(17) << d << endl;
+}
+
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace d {
+
+const u8 max_exponent = 22;
+const u8 exponent_exception_code = 23;
+static const double exact_powers_of_ten[] = {
+   1.0, // 10^0
+   10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0, 1000000000.0, 10000000000.0, // 10^10
+   100000000000.0, 1000000000000.0, 10000000000000.0, 100000000000000.0, 1000000000000000.0, 10000000000000000.0, 100000000000000000.0, 1000000000000000000.0, 10000000000000000000.0, 100000000000000000000.0, // 10^20
+   1000000000000000000000.0,
+   // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22
+   10000000000000000000000.0
+};
+// -------------------------------------------------------------------------------------
+/*
+ * Plan:
+ * 1- find the maximum exponent
+ * 2- compress everything with that exponent; values that do not fit are recorded as exceptions in a bitmap,
+ *    so only one exponent is saved in the structure.
+ *
+ * This is not necessarily slower than Decimal; there is room to optimize, e.g. don't start the search from e=0 but from the latest max_exponent.
+ */
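The core test in the plan above can be read in isolation: a double converts exactly at exponent e when scaling by 10^e, rounding, and scaling back reproduces the original value. A minimal sketch with illustrative names and a truncated powers table (the scheme below additionally enforces a significant-digit bit limit):

#include <cmath>
#include <cstdint>

static const double kPow10[] = {1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0};

// True (with the integer significand in sd) iff value == sd / 10^e exactly.
static bool convertible_at(double value, unsigned e, uint64_t &sd) {
   double scaled = std::round(value * kPow10[e]);
   sd = static_cast<uint64_t>(scaled);
   return static_cast<double>(sd) / kPow10[e] == value;
}

// Example: convertible_at(1.25, 2, sd) yields true with sd == 125;
// convertible_at(0.1, 0, sd) yields false, so the search moves on to larger e.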
+u32 MaxExponent::compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level)
+{
+   // Layout : Header | sd_v | e_v | p_v
+   // ignore bitmap
+   auto &col_struct = *reinterpret_cast<MaxExponentStructure *>(dest);
+   vector<INTEGER> sd_v;
+   vector<DOUBLE> p_v; // patches
+   u32 not_convertable = 0;
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      u32 e;
+      u64 sd;
+      bool convertable = false;
+      double current_double = src[row_i];
+      if ( std::signbit(current_double)) {
+         current_double = std::copysign(current_double, +1.0);
+      }
+      for ( e = 0; e <= max_exponent; e++ ) {
+         double cd = current_double * exact_powers_of_ten[e];
+         cd = std::round(cd);
+         sd = static_cast<u64>(cd);
+         double if_converted_back = CD(sd) / exact_powers_of_ten[e];
+         if ( if_converted_back == current_double && ((std::floor(std::log2(sd)) + 1) <= FLAGS_MaxExponent_siginifcant_digit_bits_limit)) {
+            convertable = true;
+            break;
+         }
+      }
+      if ( convertable ) {
+         die_if((e & 0x20) == 0);
+         if ( col_struct.max_exponent < e ) {
+            col_struct.max_exponent = e;
+         }
+      } else if ( nullmap != nullptr && nullmap[row_i] ) {
+         cout << row_i << endl;
+         cout << std::fixed;
+         cout.precision(std::numeric_limits<double>::max_digits10);
+         double wtf = current_double * 1000000.0;
+         cout << src[row_i] << '\t' << current_double << '\t' << wtf << endl;
+         not_convertable++;
+         printDouble(current_double);
+      }
+   }
+
+   Roaring exceptions_bitmap;
+   Roaring negative_bitmap;
+
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      u64 sd;
+      bool convertable = false;
+      double current_double = src[row_i];
+      if ( std::signbit(current_double)) {
+         current_double = std::copysign(current_double, +1.0);
+         negative_bitmap.add(row_i);
+      }
+
+      double cd = current_double * exact_powers_of_ten[col_struct.max_exponent];
+      cd = std::round(cd);
+      sd = static_cast<u64>(cd);
+      double if_converted_back = CD(sd) / exact_powers_of_ten[col_struct.max_exponent];
+      if ( if_converted_back == current_double && ((std::floor(std::log2(sd)) + 1) <= FLAGS_MaxExponent_siginifcant_digit_bits_limit)) {
+         convertable = true;
+      }
+      if ( convertable ) {
+         sd_v.push_back(static_cast<INTEGER>(sd));
+      } else {
+         exceptions_bitmap.add(row_i);
+         p_v.push_back(src[row_i]);
+      }
+   }
+   // -------------------------------------------------------------------------------------
+   exceptions_bitmap.runOptimize();
+   exceptions_bitmap.setCopyOnWrite(true);
+   negative_bitmap.runOptimize();
+   negative_bitmap.setCopyOnWrite(true);
+   // -------------------------------------------------------------------------------------
+   col_struct.converted_count = sd_v.size();
+   // -------------------------------------------------------------------------------------
+   auto write_ptr = col_struct.data;
+   write_ptr += exceptions_bitmap.write(reinterpret_cast<char *>(write_ptr), false);
+   col_struct.negatives_bitmap_offset = write_ptr - col_struct.data;
+   write_ptr += negative_bitmap.write(reinterpret_cast<char *>(write_ptr), false);
+   // -------------------------------------------------------------------------------------
+   // Compress significant digits
+   if ( sd_v.size()) {
+      col_struct.sd_offset = write_ptr - col_struct.data;
+      u32 used_space;
+      IntegerSchemePicker::compress(sd_v.data(), nullptr, write_ptr, sd_v.size(), allowed_cascading_level - 1, used_space, col_struct.sd_scheme, AUTO_SCHEME, "significant digits");
+      write_ptr += used_space;
+      // 
------------------------------------------------------------------------------------- + spdlog::debug("MaxExponent: sd_c = {} sd_s = {}", CI(col_struct.sd_scheme), CI(used_space)); + // ------------------------------------------------------------------------------------- + } + // ------------------------------------------------------------------------------------- + // Compress patches + { + col_struct.p_offset = write_ptr - col_struct.data; + u32 used_space; + DoubleSchemePicker::compress(p_v.data(), nullptr, write_ptr, p_v.size(), allowed_cascading_level - 1, used_space, col_struct.p_scheme, AUTO_SCHEME, "patches"); + write_ptr += used_space; + // ------------------------------------------------------------------------------------- + spdlog::debug("MaxExponent: p_c = {} p_s = {}", CI(col_struct.p_scheme), CI(used_space)); + // ------------------------------------------------------------------------------------- + } + // ------------------------------------------------------------------------------------- + return write_ptr - dest; +} +// ------------------------------------------------------------------------------------- +void MaxExponent::decompress(DOUBLE *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level) +{ + +} +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.hpp new file mode 100644 index 0000000..aa780cc --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/MaxExponent.hpp @@ -0,0 +1,45 @@ +#pragma once +#include "datablock/schemes/CScheme.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace d { +// ------------------------------------------------------------------------------------- +struct MaxExponentStructure { + u32 negatives_bitmap_offset; + u32 exceptions_offset; + u8 max_exponent; + // ------------------------------------------------------------------------------------- + u32 converted_count; + // ------------------------------------------------------------------------------------- + u8 sd_scheme; + u8 p_scheme; + // ------------------------------------------------------------------------------------- + u32 sd_offset; + u32 p_offset; + // ------------------------------------------------------------------------------------- + u8 data[]; +}; +// ------------------------------------------------------------------------------------- +class MaxExponent : public DoubleScheme { +public: + u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override; + void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override; + inline virtual DoubleSchemeType schemeType() + { + return staticSchemeType(); + } + inline static DoubleSchemeType staticSchemeType() + { + return DoubleSchemeType::X_DECIMAL; + } +}; +// ------------------------------------------------------------------------------------- +} +} +} +} +// 
------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/PBP.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/PBP.hpp new file mode 100644 index 0000000..fbe4f99 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/PBP.hpp @@ -0,0 +1,6 @@ +#ifndef PBP_H_ +#define PBP_H_ + + + +#endif // PBP_H_ diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.cpp new file mode 100644 index 0000000..4193e8f --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.cpp @@ -0,0 +1,46 @@ +#include "Units.hpp" +#include "RLE.hpp" +#include "datablock/schemes/v2/templated/RLE.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +// -------------------------------------------------------+------------------------------ +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +DEFINE_uint32(d_rle_force_values_scheme, AUTO_SCHEME, ""); +DEFINE_uint32(d_rle_force_counts_scheme, AUTO_SCHEME, ""); +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace d { +// ------------------------------------------------------------------------------------- +using MyRLE = TRLE; +// ------------------------------------------------------------------------------------- +double RLE::expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) +{ + if ( stats.average_run_length < 2 ) { + return 0; + } + return DoubleScheme::expectedCompressionRatio(stats, allowed_cascading_level); +} +// ------------------------------------------------------------------------------------- +u32 RLE::compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) +{ + return MyRLE::compressColumn(src, nullmap, dest, stats, allowed_cascading_level, FLAGS_d_rle_force_values_scheme, FLAGS_d_rle_force_counts_scheme); +} +// ------------------------------------------------------------------------------------- +void RLE::decompress(DOUBLE *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) +{ + return MyRLE::decompressColumn(dest, nullmap, src, tuple_count, level); +} + +string RLE::fullDescription(const u8 *src) { + return MyRLE::fullDescription(src, this->selfDescription()); +} +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.hpp new file mode 100644 index 0000000..b4c8ab5 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/double/RLE.hpp @@ -0,0 +1,31 @@ +#pragma once +#include "datablock/schemes/CScheme.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +// 
------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace d { +// ------------------------------------------------------------------------------------- +class RLE : public DoubleScheme { +public: + virtual double expectedCompressionRatio(DoubleStats &stats, u8 allowed_cascading_level) override; + u32 compress(const DOUBLE *src, const BITMAP *nullmap, u8 *dest, DoubleStats &stats, u8 allowed_cascading_level) override; + void decompress(DOUBLE *dest, BitmapWrapper *bitmap, const u8 *src, u32 tuple_count, u32 level) override; + std::string fullDescription(const u8 *src) override; + inline virtual DoubleSchemeType schemeType() override + { + return staticSchemeType(); + } + inline static DoubleSchemeType staticSchemeType() + { + return DoubleSchemeType::X_RLE; + } +}; +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.cpp new file mode 100644 index 0000000..1d86b7b --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.cpp @@ -0,0 +1,46 @@ +#include "Units.hpp" +#include "DynamicDictionary.hpp" +#include "datablock/schemes/v2/templated/DynamicDictionary.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +#include "storage/Chunk.hpp" +#include "utils/Utils.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "spdlog/spdlog.h" +// ------------------------------------------------------------------------------------- +#include +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace integer { +using MyDynamicDictionary = TDynamicDictionary; +// ------------------------------------------------------------------------------------- +double DynamicDictionary::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) +{ + return MyDynamicDictionary::expectedCompressionRatio(stats, allowed_cascading_level); +} +// ------------------------------------------------------------------------------------- +u32 DynamicDictionary::compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) +{ + return MyDynamicDictionary::compressColumn(src, nullmap, dest, stats, allowed_cascading_level); +} +// ------------------------------------------------------------------------------------- +void +DynamicDictionary::decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) +{ + return MyDynamicDictionary::decompressColumn(dest, nullmap, src, tuple_count, level); +} +// ------------------------------------------------------------------------------------- +INTEGER DynamicDictionary::lookup(u32) { UNREACHABLE(); } +void DynamicDictionary::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); } +string DynamicDictionary::fullDescription(const u8 *src) { + return MyDynamicDictionary::fullDescription(src, this->selfDescription()); +} +// ------------------------------------------------------------------------------------- +} +} 
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.hpp
new file mode 100644
index 0000000..15297a2
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/DynamicDictionary.hpp
@@ -0,0 +1,39 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+struct DynamicDictionaryStructure {
+   u8 codes_scheme_code;
+   u32 codes_offset;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+class DynamicDictionary : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level);
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   std::string fullDescription(const u8 *src) override;
+   inline virtual IntegerSchemeType schemeType()
+   {
+      return staticSchemeType();
+   }
+   inline static IntegerSchemeType staticSchemeType()
+   {
+      return IntegerSchemeType::X_DICT;
+   }
+   INTEGER lookup(u32);
+   void scan(Predicate, BITMAP *, const u8 *, u32);
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.cpp
new file mode 100644
index 0000000..39b111a
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.cpp
@@ -0,0 +1,76 @@
+#include "Units.hpp"
+#include "FOR.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "storage/Chunk.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+// -------------------------------------------------------------------------------------
+#include
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+double FOR::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   return 0;
+}
+// -------------------------------------------------------------------------------------
+u32 FOR::compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   auto &col_struct = *reinterpret_cast<FORStructure *>(dest);
+   vector<INTEGER> biased_output;
+   // -------------------------------------------------------------------------------------
+   col_struct.bias = stats.min;
+   for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+      if ( nullmap == nullptr || nullmap[row_i] ) {
+         biased_output.push_back(src[row_i] - col_struct.bias);
+      } else {
+         biased_output.push_back(src[row_i]);
+      }
+   }
+   // -------------------------------------------------------------------------------------
+   // Next Level compression
+   auto write_ptr = col_struct.data;
+   u32 used_space;
+   IntegerSchemePicker::compress(biased_output.data(), nullmap, write_ptr, biased_output.size(), allowed_cascading_level - 1, used_space, col_struct.next_scheme, AUTO_SCHEME, "for_next_level");
+   write_ptr += used_space;
+   // -------------------------------------------------------------------------------------
+   return write_ptr - dest;
+}
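The transform itself is tiny; here is a self-contained frame-of-reference sketch with illustrative names (unlike the scheme above, it has no null handling or cascaded next-level compression, and it assumes a non-empty input):

#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int32_t> for_encode(const std::vector<int32_t> &in, int32_t &bias) {
   bias = *std::min_element(in.begin(), in.end());
   std::vector<int32_t> out;
   out.reserve(in.size());
   for (int32_t v : in) out.push_back(v - bias); // smaller magnitudes -> fewer bits downstream
   return out;
}

void for_decode(std::vector<int32_t> &values, int32_t bias) {
   for (auto &v : values) v += bias;             // restores the original domain
}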
+// -------------------------------------------------------------------------------------
+void FOR::decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level)
+{
+   const auto &col_struct = *reinterpret_cast<const FORStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.next_scheme).decompress(dest, nullmap, col_struct.data, tuple_count, level + 1);
+   // -------------------------------------------------------------------------------------
+   if (nullmap != nullptr && nullmap->type() == BitmapType::ALLZEROS) {
+      // Everything is null, no point in writing anything
+      return;
+   }
+
+   // In any case we add the bias. The result for null values does not matter
+   for (u32 row_i = 0; row_i < tuple_count; row_i++) {
+      dest[row_i] += col_struct.bias;
+   }
+}
+// -------------------------------------------------------------------------------------
+INTEGER FOR::lookup(u32) { UNREACHABLE(); }
+void FOR::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+
+std::string FOR::fullDescription(const u8 *src) {
+   const auto &col_struct = *reinterpret_cast<const FORStructure *>(src);
+   auto &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.next_scheme);
+   return this->selfDescription() + " -> ([int] biased) " + scheme.fullDescription(col_struct.data);
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.hpp
new file mode 100644
index 0000000..1ebed0d
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/FOR.hpp
@@ -0,0 +1,39 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace integer {
+// -------------------------------------------------------------------------------------
+struct FORStructure {
+   INTEGER bias;
+   u8 next_scheme;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+class FOR : public IntegerScheme {
+public:
+   double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override;
+   u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 
allowed_cascading_level) override; + void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override; + std::string fullDescription(const u8 *src) override; + inline virtual IntegerSchemeType schemeType() + { + return staticSchemeType(); + } + inline static IntegerSchemeType staticSchemeType() + { + return IntegerSchemeType::X_FOR; + } + INTEGER lookup(u32); + void scan(Predicate, BITMAP *, const u8 *, u32); +}; +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.cpp new file mode 100644 index 0000000..9dbd9bc --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.cpp @@ -0,0 +1,51 @@ +#include "Units.hpp" +#include "Frequency.hpp" +#include "datablock/schemes/v2/templated/Frequency.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +#include "storage/Chunk.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "spdlog/spdlog.h" +// ------------------------------------------------------------------------------------- +#include +// ------------------------------------------------------------------------------------- +DECLARE_uint32(frequency_threshold_pct); +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace integer { +// ------------------------------------------------------------------------------------- +using MyFrequency = TFrequency; +// ------------------------------------------------------------------------------------- +double Frequency::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) +{ + if ( CD(stats.unique_count) * 100.0 / CD(stats.tuple_count) > FLAGS_frequency_threshold_pct ) { + return 0; + } + return IntegerScheme::expectedCompressionRatio(stats, allowed_cascading_level); +} +// ------------------------------------------------------------------------------------- +u32 Frequency::compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) +{ + return MyFrequency::compressColumn(src, nullmap, dest, stats, allowed_cascading_level); +} +// ------------------------------------------------------------------------------------- +void Frequency::decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) +{ + return MyFrequency::decompressColumn(dest, nullmap, src, tuple_count, level); +} +// ------------------------------------------------------------------------------------- +INTEGER Frequency::lookup(u32) { UNREACHABLE(); } +void Frequency::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); } + +std::string Frequency::fullDescription(const u8 *src) { + return MyFrequency::fullDescription(src, this->selfDescription()); +} +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.hpp 
b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.hpp new file mode 100644 index 0000000..73eb83c --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/Frequency.hpp @@ -0,0 +1,41 @@ +#pragma once +#include "datablock/schemes/CScheme.hpp" +// ------------------------------------------------------------------------------------- +#include "roaring/roaring.hh" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace integer { +// ------------------------------------------------------------------------------------- +struct FrequencyStructure { + UINTEGER top_value; + u32 exceptions_offset; + u8 next_scheme; + u8 data[]; +}; +// ------------------------------------------------------------------------------------- +class Frequency : public IntegerScheme { +public: + double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level); + u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override; + void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override; + std::string fullDescription(const u8 *src) override; + inline virtual IntegerSchemeType schemeType() + { + return staticSchemeType(); + } + inline static IntegerSchemeType staticSchemeType() + { + return IntegerSchemeType::X_FREQUENCY; + } + INTEGER lookup(u32); + void scan(Predicate, BITMAP *, const u8 *, u32); +}; +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.cpp new file mode 100644 index 0000000..fd01098 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.cpp @@ -0,0 +1,209 @@ +#include "Units.hpp" +#include "PBP.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "storage/Chunk.hpp" +#include "utils/Utils.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +#include "extern/FastPFOR.hpp" +// ------------------------------------------------------------------------------------- +#include +// ------------------------------------------------------------------------------------- +DEFINE_bool(auto_fpb, true, ""); +namespace cengine { +namespace db { +namespace v2 { +namespace integer { +// ------------------------------------------------------------------------------------- +u32 PBP::compress(const INTEGER *src, const BITMAP *, u8 *dest, SInteger32Stats &stats, u8) +{ + auto &col_struct = *reinterpret_cast (dest); + // ------------------------------------------------------------------------------------- + FPFor fast_pfor; + size_t compressed_codes_size = stats.tuple_count + 1024; // not really used + // ------------------------------------------------------------------------------------- + auto dest_integer = reinterpret_cast(col_struct.data); + u64 padding; + dest_integer = Utils::alignBy(dest_integer, 16, padding); + col_struct.padding = padding; + 
auto dest_4_aligned = reinterpret_cast<u32 *>(dest_integer);
+   // -------------------------------------------------------------------------------------
+   fast_pfor.compress(reinterpret_cast<const u32 *>(src), stats.tuple_count, dest_4_aligned, compressed_codes_size);
+   col_struct.u32_count = compressed_codes_size;
+   // -------------------------------------------------------------------------------------
+   return sizeof(XPBPStructure) + compressed_codes_size * sizeof(u32) + 16 /* for padding */;
+}
+// -------------------------------------------------------------------------------------
+void PBP::decompress(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   auto &col_struct = *reinterpret_cast<const XPBPStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   FPFor fast_pfor;
+   SIZE decompressed_codes_size = tuple_count;
+   auto encoded_array = const_cast<u32 *>(reinterpret_cast<const u32 *>(col_struct.data + col_struct.padding));
+   if ( fast_pfor.decompress(encoded_array, col_struct.u32_count, reinterpret_cast<u32 *>(dest), decompressed_codes_size) != encoded_array + col_struct.u32_count ) {
+      throw Generic_Exception("Decompressing XPBP failed");
+   }
+   assert(decompressed_codes_size == tuple_count);
+}
+// -------------------------------------------------------------------------------------
+void PBP::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+INTEGER PBP::lookup(u32) { UNREACHABLE(); }
+// -------------------------------------------------------------------------------------
+// DELTA PBP
+// -------------------------------------------------------------------------------------
+double PBP_DELTA::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   if ( !stats.is_sorted ) {
+      return 0;
+   }
+   return IntegerScheme::expectedCompressionRatio(stats, allowed_cascading_level);
+}
+// -------------------------------------------------------------------------------------
+u32 PBP_DELTA::compress(const INTEGER *src, const BITMAP *, u8 *dest, SInteger32Stats &stats, u8)
+{
+   // -------------------------------------------------------------------------------------
+   auto &col_struct = *reinterpret_cast<XPBPStructure *>(dest);
+   auto src_biased = std::unique_ptr<u32[]>(new u32[stats.tuple_count]);
+   std::memcpy(src_biased.get(), src, stats.tuple_count * sizeof(u32));
+   // TODO we may be able to get rid of this extra array and write directly to dest
+   FPFor::applyDelta(src_biased.get(), stats.tuple_count);
+   // -------------------------------------------------------------------------------------
+   FPFor fast_pfor;
+   size_t compressed_codes_size = stats.tuple_count + 1024; // not really used
+   // -------------------------------------------------------------------------------------
+   auto dest_integer = reinterpret_cast<u64>(col_struct.data);
+   u64 padding = dest_integer;
+   dest_integer = (dest_integer + 3) & ~3ul;
+   col_struct.padding = dest_integer - padding;
+   auto dest_4_aligned = reinterpret_cast<u32 *>(dest_integer);
+   // -------------------------------------------------------------------------------------
+   fast_pfor.compress(reinterpret_cast<const u32 *>(src_biased.get()), stats.tuple_count, dest_4_aligned, compressed_codes_size);
+   col_struct.u32_count = compressed_codes_size;
+   // -------------------------------------------------------------------------------------
+   return sizeof(XPBPStructure) + compressed_codes_size * sizeof(u32);
+}
+// -------------------------------------------------------------------------------------
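PBP_DELTA's precondition is sorted input (see expectedCompressionRatio above); the delta transform it delegates to FPFor::applyDelta/revertDelta can be sketched independently with illustrative stand-ins:

#include <cstddef>
#include <cstdint>

// Replace each value by its difference to the predecessor; data[0] stays as the base.
static void apply_delta(uint32_t *data, size_t n) {
   for (size_t i = n; i > 1; --i) data[i - 1] -= data[i - 2];
}

// Prefix-sum to undo apply_delta.
static void revert_delta(uint32_t *data, size_t n) {
   for (size_t i = 1; i < n; i++) data[i] += data[i - 1];
}

// On sorted data every delta is non-negative and typically small, so the
// bit-packing stage afterwards needs far fewer bits per value.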
+void PBP_DELTA::decompress(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   auto &col_struct = *reinterpret_cast<const XPBPStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   FPFor codec;
+   SIZE decompressed_codes_size = tuple_count;
+   auto encoded_array = const_cast<u32 *>(reinterpret_cast<const u32 *>(col_struct.data + col_struct.padding));
+   if ( codec.decompress(encoded_array, col_struct.u32_count, reinterpret_cast<u32 *>(dest), decompressed_codes_size) != encoded_array + col_struct.u32_count ) {
+      throw Generic_Exception("Decompressing XPBP Delta failed");
+   }
+   assert(decompressed_codes_size == tuple_count);
+   FPFor::revertDelta(reinterpret_cast<u32 *>(dest), decompressed_codes_size);
+}
+// -------------------------------------------------------------------------------------
+void PBP_DELTA::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+INTEGER PBP_DELTA::lookup(u32) { UNREACHABLE(); }
+// -------------------------------------------------------------------------------------
+double FBP::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) {
+
+   if ( FLAGS_auto_fpb ) {
+      return IntegerScheme::expectedCompressionRatio(stats, allowed_cascading_level);
+   } else {
+      return 0;
+   }
+}
+// -------------------------------------------------------------------------------------
+u32 FBP::compress(const INTEGER *src, const BITMAP *, u8 *dest, SInteger32Stats &stats, u8)
+{
+   auto &col_struct = *reinterpret_cast<XPBPStructure *>(dest);
+   // -------------------------------------------------------------------------------------
+   FBPImpl fast_pfor;
+   size_t compressed_codes_size = stats.tuple_count + 1024; // not really used
+   // -------------------------------------------------------------------------------------
+   auto dest_integer = reinterpret_cast<u64>(col_struct.data);
+   u64 padding = dest_integer;
+   dest_integer = (dest_integer + 3) & ~3ul;
+   col_struct.padding = dest_integer - padding;
+   auto dest_4_aligned = reinterpret_cast<u32 *>(dest_integer);
+   // -------------------------------------------------------------------------------------
+   fast_pfor.compress(reinterpret_cast<const u32 *>(src), stats.tuple_count, dest_4_aligned, compressed_codes_size);
+   col_struct.u32_count = compressed_codes_size;
+   // -------------------------------------------------------------------------------------
+   return sizeof(XPBPStructure) + compressed_codes_size * sizeof(u32);
+}
+// -------------------------------------------------------------------------------------
+void FBP::decompress(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   auto &col_struct = *reinterpret_cast<const XPBPStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   FBPImpl codec;
+   SIZE decompressed_codes_size = tuple_count;
+   auto encoded_array = const_cast<u32 *>(reinterpret_cast<const u32 *>(col_struct.data + col_struct.padding));
+   if ( codec.decompress(encoded_array, col_struct.u32_count, reinterpret_cast<u32 *>(dest), decompressed_codes_size) != encoded_array + col_struct.u32_count ) {
+      throw Generic_Exception("Decompressing XPBP failed");
+   }
+   assert(decompressed_codes_size == tuple_count);
+}
+// -------------------------------------------------------------------------------------
+void FBP::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+INTEGER FBP::lookup(u32) { UNREACHABLE(); }
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
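EXP_FBP below only estimates the packed size rather than writing data. The arithmetic, spelled out with illustrative helpers (assuming 64-bit words holding only whole values, as the formula below does):

#include <cmath>
#include <cstdint>

static uint32_t bits_needed(uint32_t max_value) {      // position of the highest set bit, >= 1
   uint32_t b = 1;
   while ((max_value >>= 1) != 0) b++;
   return b;
}

static uint32_t packed_size_bytes(uint32_t n, uint32_t max_value) {
   uint32_t per_word = 64 / bits_needed(max_value);    // values that fit in one 64-bit word
   auto words = static_cast<uint32_t>(std::ceil(static_cast<double>(n) / per_word));
   return 4 + 8 * words;                               // 4-byte header + 8 bytes per word
}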
+double EXP_FBP::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level)
+{
+   u32 b = Utils::getBitsNeeded(stats.max);
+   u32 integers_in_block = 64 / b;
+   u32 bytes = 8 * std::ceil(static_cast<double>(stats.tuple_count) / integers_in_block);
+   return 4 + bytes;
+}
+// -------------------------------------------------------------------------------------
+u32 EXP_FBP::compress(const INTEGER *src, const BITMAP *, u8 *dest, SInteger32Stats &stats, u8)
+{
+   u32 b = Utils::getBitsNeeded(stats.max);
+   u32 integers_in_block = 64 / b;
+   u32 bytes = 8 * std::ceil(static_cast<double>(stats.tuple_count) / integers_in_block);
+   return 4 + bytes;
+}
+// -------------------------------------------------------------------------------------
+void EXP_FBP::decompress(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   UNREACHABLE();
+}
+// -------------------------------------------------------------------------------------
+void EXP_FBP::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); }
+INTEGER EXP_FBP::lookup(u32) { UNREACHABLE(); }
+// -------------------------------------------------------------------------------------
+u32 FBP64::compress(u64 *src, u8 *dest, u32 tuple_count)
+{
+   auto &col_struct = *reinterpret_cast<XPBPStructure *>(dest);
+   // -------------------------------------------------------------------------------------
+   FPFor fast_pfor;
+   size_t compressed_codes_size = tuple_count * 2 + 1024; // not really used
+   // -------------------------------------------------------------------------------------
+   auto dest_integer = reinterpret_cast<u64>(col_struct.data);
+   u64 padding = dest_integer;
+   dest_integer = (dest_integer + 3) & ~3ul;
+   col_struct.padding = dest_integer - padding;
+   auto dest_4_aligned = reinterpret_cast<u32 *>(dest_integer);
+   // -------------------------------------------------------------------------------------
+   fast_pfor.compress(reinterpret_cast<const u32 *>(src), tuple_count, dest_4_aligned, compressed_codes_size);
+   col_struct.u32_count = compressed_codes_size;
+   // -------------------------------------------------------------------------------------
+   return sizeof(XPBPStructure) + compressed_codes_size * sizeof(u32);
+}
+void FBP64::decompress(u8 *dest, const u8 *src, u32 tuple_count, u32 level)
+{
+   auto &col_struct = *reinterpret_cast<const XPBPStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   FPFor codec;
+
+   SIZE decompressed_codes_size = tuple_count;
+   auto encoded_array = const_cast<u32 *>(reinterpret_cast<const u32 *>(col_struct.data + col_struct.padding));
+   if ( codec.decompress(encoded_array, col_struct.u32_count, reinterpret_cast<u32 *>(dest), decompressed_codes_size) != encoded_array + col_struct.u32_count ) {
+      throw Generic_Exception("Decompressing XPBP failed");
+   }
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.hpp
new file mode 100644
index 0000000..dddcd11
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/PBP.hpp
@@ -0,0 +1,91 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace integer {
+// 
------------------------------------------------------------------------------------- +__attribute__((packed)) struct XPBPStructure { //need to be aligned by 4 because of FastPFor encodeArray + u32 u32_count; // number of 4 bytes written by FastPFor + u8 padding; + u8 data[]; +}; +// ------------------------------------------------------------------------------------- +class PBP : public IntegerScheme { +public: + u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override; + void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override; + inline virtual IntegerSchemeType schemeType() override + { + return staticSchemeType(); + } + inline static IntegerSchemeType staticSchemeType() + { + return IntegerSchemeType::X_PBP; + } + INTEGER lookup(u32); + void scan(Predicate, BITMAP *, const u8 *, u32); +}; +// ------------------------------------------------------------------------------------- +class PBP_DELTA : public IntegerScheme { +public: + double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level); + u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override; + void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override; + inline virtual IntegerSchemeType schemeType() override + { + return staticSchemeType(); + } + inline static IntegerSchemeType staticSchemeType() + { + return IntegerSchemeType::X_PBP_DELTA; + } + INTEGER lookup(u32); + void scan(Predicate, BITMAP *, const u8 *, u32); +}; +// ------------------------------------------------------------------------------------- +class FBP : public IntegerScheme { +public: + double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level); + u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override; + void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override; + inline virtual IntegerSchemeType schemeType() override + { + return staticSchemeType(); + } + inline static IntegerSchemeType staticSchemeType() + { + return IntegerSchemeType::X_FBP; + } + INTEGER lookup(u32) override; + void scan(Predicate, BITMAP *, const u8 *, u32) override; +}; +// ------------------------------------------------------------------------------------- +class EXP_FBP : public IntegerScheme { +public: + double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level); + u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override; + void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override; + inline virtual IntegerSchemeType schemeType() override + { + return staticSchemeType(); + } + inline static IntegerSchemeType staticSchemeType() + { + return IntegerSchemeType::X_FBP; + } + INTEGER lookup(u32) override; + void scan(Predicate, BITMAP *, const u8 *, u32) override; +}; +// ------------------------------------------------------------------------------------- + class FBP64 { + public: + static u32 compress(u64 *src, u8 *dest, u32 tuple_count); + static void decompress(u8* dest, const u8 *src, u32 tuple_count, u32 level); + }; +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git 
a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.cpp new file mode 100644 index 0000000..63e7a16 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.cpp @@ -0,0 +1,54 @@ +#include "Units.hpp" +#include "RLE.hpp" +#include "datablock/schemes/v2/templated/RLE.hpp" +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePool.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace integer { +// ------------------------------------------------------------------------------------- +DEFINE_uint32(rle_threshold, 2, ""); +DEFINE_uint32(i_rle_force_values_scheme, AUTO_SCHEME, ""); +DEFINE_uint32(i_rle_force_counts_scheme, AUTO_SCHEME, ""); +// ------------------------------------------------------------------------------------- +using MyRLE = TRLE; +// ------------------------------------------------------------------------------------- +double RLE::expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) +{ + if ( stats.average_run_length < 2 ) { + return 0; + } + return IntegerScheme::expectedCompressionRatio(stats, allowed_cascading_level); +} +// ------------------------------------------------------------------------------------- +u32 RLE::compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) +{ + return MyRLE::compressColumn(src, nullmap, dest, stats, allowed_cascading_level, FLAGS_i_rle_force_values_scheme, FLAGS_i_rle_force_counts_scheme); +} +// ------------------------------------------------------------------------------------- +void RLE::decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) +{ + return MyRLE::decompressColumn(dest, nullmap, src, tuple_count, level); +} +u32 RLE::decompressRuns(INTEGER *values, INTEGER *counts, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, + u32 level) { + return MyRLE::decompressRuns(values, counts, nullmap, src, tuple_count, level); +} +// ------------------------------------------------------------------------------------- +INTEGER RLE::lookup(u32) { UNREACHABLE(); } +void RLE::scan(Predicate, BITMAP *, const u8 *, u32) { UNREACHABLE(); } + +std::string RLE::fullDescription(const u8 *src) { + return MyRLE::fullDescription(src, this->selfDescription()); +} +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.hpp new file mode 100644 index 0000000..40048c4 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/integer/RLE.hpp @@ -0,0 +1,42 @@ +#pragma once +#include "datablock/schemes/CScheme.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +// 
------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +namespace integer { +// ------------------------------------------------------------------------------------- +struct RLEStructure { + u32 runs_count; + u32 runs_count_offset; + u8 values_scheme_code; + u8 counts_scheme_code; + u8 data[]; +}; +// ------------------------------------------------------------------------------------- +class RLE : public IntegerScheme { +public: + virtual double expectedCompressionRatio(SInteger32Stats &stats, u8 allowed_cascading_level) override; + u32 compress(const INTEGER *src, const BITMAP *nullmap, u8 *dest, SInteger32Stats &stats, u8 allowed_cascading_level) override; + u32 decompressRuns(INTEGER *values, INTEGER *counts, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level); + void decompress(INTEGER *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override; + std::string fullDescription(const u8 *src) override; + inline virtual IntegerSchemeType schemeType() override + { + return staticSchemeType(); + } + inline static IntegerSchemeType staticSchemeType() + { + return IntegerSchemeType::X_RLE; + } + INTEGER lookup(u32); + void scan(Predicate, BITMAP *, const u8 *, u32); +}; +// ------------------------------------------------------------------------------------- +} +} +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.cpp new file mode 100644 index 0000000..ee6d49b --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.cpp @@ -0,0 +1,572 @@ +#include "Units.hpp" +#include "DynamicDictionary.hpp" +#include "storage/Chunk.hpp" +#include "datablock/schemes/v2/integer/PBP.hpp" +#include "datablock/schemes/v2/integer/RLE.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "spdlog/spdlog.h" +#include "fsst.h" +#include "storage/StringPointerArrayViewer.hpp" +// ------------------------------------------------------------------------------------- +#include +// ------------------------------------------------------------------------------------- +DECLARE_string(fsst_stats); +namespace cengine { +namespace db { +namespace v2 { +namespace string { +// ------------------------------------------------------------------------------------- +DEFINE_bool(string_dictionary_use_fsst, true, ""); +DEFINE_uint32(fsst_force_codes_scheme, AUTO_SCHEME, ""); +DEFINE_uint32(fsst_input_size_threshold, FSST_THRESHOLD, ""); +DEFINE_uint32(fsst_codes_max_cascading_level, 2, ""); +// ------------------------------------------------------------------------------------- +/* + * Plan: + * Output only 32-bits codes, hence no need for templates + * compress distinct strings with fsst + */ +// ------------------------------------------------------------------------------------- +double DynamicDictionary::expectedCompressionRatio(StringStats &stats, u8) +{ + u32 bits_per_code = std::floor(std::log2(stats.distinct_values.size())) + 1; + u32 after_size = sizeof(DynamicDictionaryStructure); + after_size += FSST_MAXHEADER; + after_size += stats.tuple_count * (CD(bits_per_code) / 8.0); + after_size += 
sizeof(StringArrayViewer::Slot) * (1 + stats.distinct_values.size());
+   if ( stats.total_unique_length >= FLAGS_fsst_input_size_threshold ) { // Threshold
+      after_size += 0.5 * static_cast<double>(stats.total_unique_length); // 0.5 for fsst
+   } else {
+      after_size += static_cast<double>(stats.total_unique_length);
+   }
+   return CD(stats.total_size) / CD(after_size);
+}
+// -------------------------------------------------------------------------------------
+bool DynamicDictionary::usesFsst(const u8 *src)
+{
+   const auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+   return col_struct.use_fsst;
+}
+// -------------------------------------------------------------------------------------
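For reference, the fsst.h call sequence used in compress() below, as a hedged standalone round-trip sketch (the buffer sizing is simplified the same "fake"-generous way the patch sizes its output; error handling is omitted):

#include "fsst.h"
#include <string>
#include <vector>

void fsst_example(std::vector<std::string> &strings) {
   const size_t n = strings.size();
   std::vector<size_t> len_in(n), len_out(n);
   std::vector<unsigned char *> str_in(n), str_out(n);
   size_t total_len = 0;
   for (size_t i = 0; i < n; i++) {
      str_in[i] = const_cast<unsigned char *>(reinterpret_cast<const unsigned char *>(strings[i].data()));
      len_in[i] = strings[i].size();
      total_len += len_in[i];
   }
   fsst_encoder_t *encoder = fsst_create(n, len_in.data(), str_in.data(), 0);
   unsigned char table[FSST_MAXHEADER];
   auto table_size = fsst_export(encoder, table);     // serialized symbol table
   std::vector<unsigned char> out(7 + 4 * total_len); // generous worst-case output buffer
   size_t done = fsst_compress(encoder, n, len_in.data(), str_in.data(),
                               out.size(), out.data(), len_out.data(), str_out.data());
   // done == n means every string fit; a reader would fsst_import() the table
   // into an fsst_decoder_t and fsst_decompress() each string.
   fsst_destroy(encoder);
   (void)table_size; (void)done;
}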
+      write_ptr += fsst_table_used_space;
+      // -------------------------------------------------------------------------------------
+      // Compress
+      const u64 output_buffer_size = 7 + 4 * stats.total_unique_length; // fake
+      if ( fsst_compress(encoder, fsst_n, input_string_lengths.get(), input_string_buffers.get(),
+                         output_buffer_size, write_ptr,
+                         output_string_lengths.get(), output_string_buffers.get()) != fsst_n ) {
+         throw Generic_Exception("FSST Compression failed !");
+      }
+      fsst_strings_used_space = output_string_lengths[fsst_n - 1] + (output_string_buffers[fsst_n - 1] - output_string_buffers[0]);
+      // -------------------------------------------------------------------------------------
+      spdlog::debug("FSST: strings_s = {}", static_cast<s64>(fsst_strings_used_space));
+      // -------------------------------------------------------------------------------------
+      spdlog::debug("FSST: string_pool_size : before = {} after = {}", static_cast<s64>(stats.total_unique_length), static_cast<s64>(fsst_table_used_space + fsst_strings_used_space));
+      // -------------------------------------------------------------------------------------
+      write_ptr += fsst_strings_used_space;
+      // -------------------------------------------------------------------------------------
+      // Convert destLen to offsets
+      col_struct.fsst_offsets_offset = write_ptr - col_struct.data;
+      auto fsst_offsets = reinterpret_cast<u32 *>(write_ptr);
+      u32 last_offset = 0;
+      for ( u32 row_i = 0; row_i < fsst_n; row_i++ ) {
+         fsst_offsets[row_i] = last_offset;
+         last_offset += output_string_lengths[row_i];
+      }
+      fsst_offsets[fsst_n] = last_offset;
+      write_ptr += (fsst_n + 1) * sizeof(u32);
+      // -------------------------------------------------------------------------------------
+      // Experimental part: check if all offsets are equal (TODO)
+      // auto fsst_length_stats = NumberStats<u64>::generateStats(const_cast<const u64 *>(output_string_lengths.get()), nullptr, fsst_n);
+      // u32 min_length = std::numeric_limits<u32>::max(), max_length = 0;
+      // for ( u32 row_i = 1; row_i < fsst_n; row_i++ ) {
+      //    if (output_string_lengths[row_i] < min_length) {
+      //       min_length = output_string_lengths[row_i];
+      //    } else if (output_string_lengths[row_i] > max_length) {
+      //       max_length = output_string_lengths[row_i];
+      //    }
+      // }
+      // cout << "v -> c " << endl;
+      // for (const auto &t : fsst_length_stats.distinct_values) {
+      //    cout << t.first << '\t' << t.second << endl;
+      // }
+      std::vector<INTEGER> uncompressed_lengths;
+      for (u32 i = 0; i < fsst_n; i++) {
+         uncompressed_lengths.push_back(static_cast<INTEGER>(input_string_lengths[i]));
+      }
+      col_struct.lengths_offset = write_ptr - col_struct.data;
+      u32 used_space;
+      IntegerSchemePicker::compress(uncompressed_lengths.data(), nullptr, write_ptr, uncompressed_lengths.size(), FLAGS_fsst_codes_max_cascading_level, used_space, col_struct.lengths_scheme,
+                                    static_cast<u8>(AUTO_SCHEME));
+      write_ptr += used_space;
+   } else {
+      auto dest_slot_ptr = reinterpret_cast<StringArrayViewer::Slot *>(col_struct.data);
+      u8 *str_write_ptr = col_struct.data + ((distinct_values.size() + 1) * sizeof(StringArrayViewer::Slot));
+      for ( const auto &distinct_str : distinct_values ) {
+         dest_slot_ptr++->offset = str_write_ptr - col_struct.data; // Note: string offsets are relative to the first slot
+         std::memcpy(str_write_ptr, distinct_str.data(), distinct_str.length());
+         str_write_ptr += distinct_str.length();
+      }
+      dest_slot_ptr->offset = str_write_ptr - col_struct.data;
+      write_ptr = str_write_ptr;
+   }
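+   /* Editor's note: "Convert destLen to offsets" above is an exclusive prefix
+    * sum over the compressed string lengths; a stand-alone equivalent:
+    *
+    *   std::vector<u32> to_offsets(const u64 *lens, u32 n) {
+    *      std::vector<u32> offsets(n + 1);
+    *      u32 acc = 0;
+    *      for (u32 i = 0; i < n; i++) { offsets[i] = acc; acc += lens[i]; }
+    *      offsets[n] = acc; // end of the last string
+    *      return offsets;
+    *   }
+    */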
+   // -------------------------------------------------------------------------------------
+   // Codes
+   {
+      s32 run_count = 0;
+      s32 current_code = -1;
+      vector<INTEGER> codes;
+      for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+         const str &current_value = src(row_i);
+         auto it = std::lower_bound(distinct_values.begin(), distinct_values.end(), current_value);
+         assert(it != distinct_values.end());
+         auto new_code = static_cast<s32>(std::distance(distinct_values.begin(), it));
+         codes.push_back(new_code);
+         if (new_code != current_code) {
+            run_count++;
+            current_code = new_code;
+         }
+      }
+      double avg_run_length = static_cast<double>(stats.tuple_count) / static_cast<double>(run_count);
+      auto forced_scheme = static_cast<IntegerSchemeType>(FLAGS_fsst_force_codes_scheme);
+      col_struct.use_rle_optimized_path = false;
+      if (avg_run_length > 3.0) {
+         col_struct.use_rle_optimized_path = true;
+         forced_scheme = IntegerSchemeType::X_RLE;
+      }
+      // -------------------------------------------------------------------------------------
+      // Compress codes
+      col_struct.codes_offset = write_ptr - col_struct.data;
+      u32 used_space;
+      IntegerSchemePicker::compress(codes.data(), nullptr, write_ptr, codes.size(), FLAGS_fsst_codes_max_cascading_level, used_space, col_struct.codes_scheme,
+                                    static_cast<u8>(forced_scheme));
+      write_ptr += used_space;
+      // -------------------------------------------------------------------------------------
+      spdlog::debug("FSST: codes_c = {} codes_s = {}", static_cast<s64>(col_struct.codes_scheme), static_cast<s64>(used_space));
+      // -------------------------------------------------------------------------------------
+   }
+   // -------------------------------------------------------------------------------------
+   u32 after_size = write_ptr - dest;
+   ThreadCache::dumpFsst(stats.total_length, stats.total_unique_length, fsst_strings_used_space + FSST_MAXHEADER,
+                         after_size);
+   return after_size;
+}
+// -------------------------------------------------------------------------------------
+u32 DynamicDictionary::getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap)
+{
+   return reinterpret_cast<const DynamicDictionaryStructure *>(src)->total_decompressed_size;
+}
+// -------------------------------------------------------------------------------------
+u32 DynamicDictionary::getDecompressedSizeNoCopy(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) {
+   const auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+   // total_decompressed_size = slots_size + strings_size
+   // We need to get rid of the (tuple_count + 1) slots
+   auto strings_size = col_struct.total_decompressed_size - ((tuple_count + 1) * sizeof(StringArrayViewer::Slot));
+   auto views_size = (tuple_count + 4) * sizeof(StringPointerArrayViewer::View);
+   return strings_size + views_size;
+}
+// -------------------------------------------------------------------------------------
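+/* Editor's note: the RLE fast path chosen in compress() above (and consumed by
+ * decompress() below) triggers once the code stream's average run length
+ * exceeds 3. Example: codes {7,7,7,7,2,2,9,9,9,9,9,9} are 12 tuples in 3 runs,
+ * so avg_run_length = 4.0 and X_RLE is forced. The run count is simply:
+ *
+ *   u32 count_runs(const s32 *codes, u32 n) {
+ *      u32 runs = (n > 0) ? 1 : 0;
+ *      for (u32 i = 1; i < n; i++)
+ *         if (codes[i] != codes[i - 1]) runs++;
+ *      return runs;
+ *   }
+ */
+// -------------------------------------------------------------------------------------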
+void DynamicDictionary::decompress(u8 *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+{
+   // -------------------------------------------------------------------------------------
+   const auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+   const u8 *compressed_codes_ptr = col_struct.data + col_struct.codes_offset;
+   auto dest_slots = reinterpret_cast<StringArrayViewer::Slot *>(dest);
+   u32 start = (sizeof(StringArrayViewer::Slot) * (tuple_count + 1)) + SIMD_EXTRA_BYTES;
+   auto dest_write_ptr = dest + start;
+   // -------------------------------------------------------------------------------------
+   // Decode codes
+   if (col_struct.use_rle_optimized_path) {
+      IntegerScheme &codes_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme);
+      auto &rle = dynamic_cast<v2::integer::RLE &>(codes_scheme);
+
+      thread_local std::vector<std::vector<INTEGER>> values_v;
+      auto values_ptr = get_level_data(values_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+
+      thread_local std::vector<std::vector<INTEGER>> counts_v;
+      auto counts_ptr = get_level_data(counts_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+
+      u32 runs_count = rle.decompressRuns(values_ptr, counts_ptr, nullptr, compressed_codes_ptr, tuple_count, level + 1);
+
+      auto offsets_ptr = reinterpret_cast<u32 *>(dest_slots);
+
+      // IDEA:
+      // Making all the cache arrays a single array (either u64 or some tuple) could improve cache locality,
+      // since the caches are always accessed at the same index.
+      thread_local std::vector<std::vector<const char *>> cached_strings_v;
+      auto cached_strings_ptr = get_level_data(cached_strings_v, col_struct.num_codes, level);
+      // There might be old data in here, 0 it out
+      std::fill_n(cached_strings_v[level].begin(), col_struct.num_codes, nullptr);
+      thread_local std::vector<std::vector<u32>> cached_run_lengths_v;
+      auto cached_run_lengths_ptr = get_level_data(cached_run_lengths_v, col_struct.num_codes, level);
+
+      thread_local std::vector<std::vector<u32>> cached_str_lengths_v;
+      auto cached_str_lengths_ptr = get_level_data(cached_str_lengths_v, col_struct.num_codes, level);
+
+      if (!col_struct.use_fsst) {
+         StringArrayViewer dict_array(col_struct.data);
+
+         // Fill str_length cache
+         for (u32 idx = 0; idx < col_struct.num_codes; idx++) {
+            cached_str_lengths_ptr[idx] = dict_array.size(idx);
+         }
+
+         // writeOffsetsU32 may write extra elements which in turn may overwrite strings if not used in the correct order.
+         // Therefore we write all the offsets first and then write the strings.
+         for (u32 run = 0; run < runs_count; run++) {
+            u32 code = values_ptr[run];
+            u32 run_length = counts_ptr[run];
+            u32 str_length = cached_str_lengths_ptr[code];
+            Utils::writeOffsetsU32(offsets_ptr, start, str_length, run_length);
+            offsets_ptr += run_length;
+            start += run_length * str_length;
+         }
+         *offsets_ptr = start;
+
+         for (u32 run = 0; run < runs_count; run++) {
+            u32 code = values_ptr[run];
+            u32 run_length = counts_ptr[run];
+            u32 str_length = cached_str_lengths_ptr[code];
+            const char *str_src;
+            u32 src_length;
+            if (cached_strings_ptr[code] == nullptr) {
+               str_src = dict_array.get_pointer(code);
+               src_length = 1;
+               cached_strings_ptr[code] = reinterpret_cast<const char *>(dest_write_ptr);
+               cached_run_lengths_ptr[code] = run_length;
+            } else {
+               str_src = cached_strings_ptr[code];
+               src_length = cached_run_lengths_ptr[code];
+               if (run_length > src_length) {
+                  cached_strings_ptr[code] = reinterpret_cast<const char *>(dest_write_ptr);
+                  cached_run_lengths_ptr[code] = run_length;
+               }
+            }
+            Utils::multiplyString(reinterpret_cast<char *>(dest_write_ptr), str_src, str_length, run_length, src_length);
+            dest_write_ptr += run_length * str_length;
+         }
+      } else {
+         fsst_decoder_t decoder;
+         const u32 fsst_dict_size = FSST_MAXHEADER;
+         die_if(fsst_import(&decoder, const_cast<u8 *>(col_struct.data)) > 0);
+         auto fsst_offsets = reinterpret_cast<const u32 *>(col_struct.data + col_struct.fsst_offsets_offset);
+         auto fsst_compressed_buf = col_struct.data + fsst_dict_size;
+
+         /*
+          * For fsst we do not know the string lengths before decompression, so we cannot write the offsets upfront.
+          */
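+         /* Editor's note: Utils::multiplyString materializes run_length copies
+          * of a string. A hypothetical doubling implementation (a sketch, not
+          * the project's actual code) illustrating the src_length parameter,
+          * which says how many copies are already available at src:
+          *
+          *   void multiply_string_sketch(char *dst, const char *src,
+          *                               u32 len, u32 reps, u32 src_reps) {
+          *      u32 have = std::min(reps, src_reps);
+          *      std::memcpy(dst, src, have * len);   // seed from existing copies
+          *      while (have < reps) {                // then double in place
+          *         u32 take = std::min(have, reps - have);
+          *         std::memcpy(dst + have * len, dst, take * len);
+          *         have += take;
+          *      }
+          *   }
+          */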
+         for (u32 run = 0; run < runs_count; run++) {
+            u32 code = values_ptr[run];
+            u32 run_length = counts_ptr[run];
+            u32 run_length_left = run_length;
+            const char *str_src;
+            u32 src_length;
+            u32 str_length;
+            if (cached_strings_ptr[code] == nullptr) {
+               // We need to decompress the string
+               auto compressed_str_length = fsst_offsets[code + 1] - fsst_offsets[code];
+               auto compressed_str_ptr = fsst_compressed_buf + fsst_offsets[code];
+               str_length = fsst_decompress(&decoder, compressed_str_length, const_cast<u8 *>(compressed_str_ptr),
+                                            MAX_STR_LENGTH, dest_write_ptr);
+
+               // Write cache
+               cached_strings_ptr[code] = reinterpret_cast<const char *>(dest_write_ptr);
+               cached_str_lengths_ptr[code] = str_length;
+               // We will have written run_length strings in the end, but for now have only decompressed a single one.
+               cached_run_lengths_ptr[code] = run_length;
+
+               src_length = 1;
+               str_src = reinterpret_cast<const char *>(dest_write_ptr);
+               run_length_left--;
+
+               // Advance write pointer for utils functions
+               dest_write_ptr += str_length;
+            } else {
+               // String has already been decompressed
+
+               // Get values from cache
+               str_length = cached_str_lengths_ptr[code];
+               str_src = cached_strings_ptr[code];
+               src_length = cached_run_lengths_ptr[code];
+
+               // Update cache if necessary
+               if (run_length > src_length) {
+                  cached_strings_ptr[code] = reinterpret_cast<const char *>(dest_write_ptr);
+                  cached_run_lengths_ptr[code] = run_length;
+               }
+            }
+
+            // Write offsets
+            Utils::writeOffsetsU32(offsets_ptr, start, str_length, run_length);
+            offsets_ptr += run_length;
+            start += run_length * str_length;
+
+            // Write strings
+            Utils::multiplyString(reinterpret_cast<char *>(dest_write_ptr), str_src, str_length, run_length_left, src_length);
+            dest_write_ptr += run_length_left * str_length;
+         }
+         *offsets_ptr = start;
+      }
+   } else {
+      thread_local std::vector<std::vector<INTEGER>> decompressed_codes_v;
+      auto decompressed_codes = get_level_data(decompressed_codes_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER),
+                                               level);
+      IntegerScheme &codes_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme);
+      codes_scheme.decompress(decompressed_codes, nullptr, compressed_codes_ptr, tuple_count, level + 1);
+      // -------------------------------------------------------------------------------------
+      if (col_struct.use_fsst) {
+         fsst_decoder_t decoder;
+         const u32 fsst_dict_size = FSST_MAXHEADER;
+         die_if(fsst_import(&decoder, const_cast<u8 *>(col_struct.data)) > 0);
+         auto fsst_offsets = reinterpret_cast<const u32 *>(col_struct.data + col_struct.fsst_offsets_offset);
+         auto fsst_compressed_buf = col_struct.data + fsst_dict_size;
+         for (u32 row_i = 0; row_i < tuple_count; row_i++) {
+            dest_slots[row_i].offset = dest_write_ptr - dest;
+            auto code = decompressed_codes[row_i];
+            auto compressed_str_length = fsst_offsets[code + 1] - fsst_offsets[code];
+            auto compressed_str_ptr = fsst_compressed_buf + fsst_offsets[code];
+            dest_write_ptr += fsst_decompress(&decoder, compressed_str_length, const_cast<u8 *>(compressed_str_ptr),
+                                              MAX_STR_LENGTH, dest_write_ptr);
+         }
+      } else {
+         StringArrayViewer dict_array(col_struct.data);
+         for (u32 row_i = 0; row_i < tuple_count; row_i++) {
+            dest_slots[row_i].offset = dest_write_ptr - dest;
+            auto current_code = decompressed_codes[row_i];
+            u32 length = dict_array.size(current_code);
+            const char *string = dict_array.get_pointer(current_code);
+            std::memcpy(dest_write_ptr, string, length);
+            dest_write_ptr += length;
+         }
+      }
+      dest_slots[tuple_count].offset = dest_write_ptr - dest;
+      die_if(dest_write_ptr - dest - SIMD_EXTRA_BYTES == col_struct.total_decompressed_size);
+   }
+}
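+/* Editor's note: the buffer written by decompress() is laid out as
+ * [(tuple_count + 1) Slots][string bytes]; string i spans
+ * [slots[i].offset, slots[i+1].offset). Reading one string back (type names as
+ * used in this codebase):
+ *
+ *   str get_string(const u8 *buf, u32 i) {
+ *      auto slots = reinterpret_cast<const StringArrayViewer::Slot *>(buf);
+ *      return str(reinterpret_cast<const char *>(buf) + slots[i].offset,
+ *                 slots[i + 1].offset - slots[i].offset);
+ *   }
+ */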
+
+bool DynamicDictionary::decompressNoCopy(u8 *dest, BitmapWrapper *, const u8 *src, u32 tuple_count,
+                                         u32 level) {
+   const auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+
+   // Build views
+   thread_local std::vector<std::vector<StringPointerArrayViewer::View>> views_v;
+   auto views_ptr = get_level_data(views_v, col_struct.num_codes, level);
+
+   auto dest_views = reinterpret_cast<StringPointerArrayViewer::View *>(dest);
+   auto current_offset = (tuple_count + 4) * sizeof(StringPointerArrayViewer::View);
+
+   // Ideas for better performance
+   // - save dictionary strings as a StringPointerArrayView straight away
+   // - decompress all fsst strings at once and save string lengths separately
+   // - optimize the final loop with AVX gather or cache-miss hiding through vectorization
+
+   // Copy strings to destination
+   if (col_struct.use_fsst) {
+      // Decompress lengths
+      thread_local std::vector<std::vector<INTEGER>> uncompressed_lengths_v;
+      auto uncompressed_lengths_ptr = get_level_data(uncompressed_lengths_v, col_struct.num_codes + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+      IntegerScheme &lengths_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.lengths_scheme);
+      lengths_scheme.decompress(uncompressed_lengths_ptr, nullptr, col_struct.data + col_struct.lengths_offset, col_struct.num_codes, level + 1);
+
+      // Fill lengths and offsets
+      u32 start_offset = current_offset;
+      for (u32 c = 0; c < col_struct.num_codes; c++) {
+         views_ptr[c].offset = current_offset;
+         views_ptr[c].length = uncompressed_lengths_ptr[c];
+         current_offset += uncompressed_lengths_ptr[c];
+      }
+      u32 total_length = current_offset - start_offset;
+
+      // Decompress all strings in one go
+      fsst_decoder_t decoder;
+      const u32 fsst_dict_size = FSST_MAXHEADER;
+      die_if(fsst_import(&decoder, const_cast<u8 *>(col_struct.data)) > 0);
+      auto fsst_compressed_buf = col_struct.data + fsst_dict_size;
+      u32 total_compressed_length = col_struct.fsst_offsets_offset - fsst_dict_size;
+      fsst_decompress(&decoder, total_compressed_length, const_cast<u8 *>(fsst_compressed_buf),
+                      total_length + 4096, dest + start_offset);
+   } else {
+      auto start_offset = current_offset;
+      StringArrayViewer dict_array(col_struct.data);
+      for (u32 c = 0; c < col_struct.num_codes; c++) {
+         views_ptr[c].length = dict_array.size(c);
+         views_ptr[c].offset = current_offset;
+         current_offset += views_ptr[c].length;
+      }
+
+      std::memcpy(dest + start_offset, dict_array.get_pointer(0), current_offset - start_offset);
+   }
+
+   const u8 *compressed_codes_ptr = col_struct.data + col_struct.codes_offset;
+   if (col_struct.use_rle_optimized_path) {
+      IntegerScheme &codes_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme);
+      auto &rle = dynamic_cast<v2::integer::RLE &>(codes_scheme);
+
+      thread_local std::vector<std::vector<INTEGER>> values_v;
+      auto values_ptr = get_level_data(values_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+
+      thread_local std::vector<std::vector<INTEGER>> counts_v;
+      auto counts_ptr = get_level_data(counts_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+
+      u32 runs_count = rle.decompressRuns(values_ptr, counts_ptr, nullptr, compressed_codes_ptr, tuple_count, level + 1);
+
+      static_assert(sizeof(StringPointerArrayViewer::View) == 8);
+#ifdef BTR_USE_SIMD
+      for (u32 run = 0; run < runs_count; run++) {
+         INTEGER code = values_ptr[run];
+         auto *data = reinterpret_cast<u64 *>(views_ptr + code);
+         __m256i data_v = _mm256_set1_epi64x(*data);
+         INTEGER run_length = counts_ptr[run];
+         auto dest_view_simd = reinterpret_cast<__m256i *>(dest_views);
+         for (INTEGER repeat = 0; repeat < run_length; repeat += 4) {
+            _mm256_storeu_si256(dest_view_simd, data_v);
+            dest_view_simd++;
+         }
+         dest_views += run_length;
+      }
+#else
+      for (u32 run = 0; run < runs_count; run++) {
+         INTEGER code = values_ptr[run];
+         for (INTEGER repeat = 0; repeat < counts_ptr[run]; ++repeat) {
+            *dest_views++ = views_ptr[code];
+         }
+      }
+#endif
+   } else {
+      // Decompress codes
+      thread_local std::vector<std::vector<INTEGER>> decompressed_codes_v;
+      auto decompressed_codes = get_level_data(decompressed_codes_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER),
+                                               level);
+      IntegerScheme &codes_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme);
+      codes_scheme.decompress(decompressed_codes, nullptr, compressed_codes_ptr, tuple_count, level + 1);
+
+      u32 row_i = 0;
+#ifdef BTR_USE_SIMD
+      static_assert(sizeof(*views_ptr) == 8);
+      static_assert(SIMD_EXTRA_BYTES >= 4 * sizeof(__m256i));
+      if (tuple_count >= 16) {
+         while (row_i < tuple_count - 15) {
+            // We cannot write out of bounds here like we do for other dict implementations
+            // because it would destroy the string data.
+
+            // Load codes.
+            __m128i codes_0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(decompressed_codes + 0));
+            __m128i codes_1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(decompressed_codes + 4));
+            __m128i codes_2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(decompressed_codes + 8));
+            __m128i codes_3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(decompressed_codes + 12));
+
+            // Gather values.
+            __m256i values_0 = _mm256_i32gather_epi64(reinterpret_cast<const long long *>(views_ptr), codes_0, 8);
+            __m256i values_1 = _mm256_i32gather_epi64(reinterpret_cast<const long long *>(views_ptr), codes_1, 8);
+            __m256i values_2 = _mm256_i32gather_epi64(reinterpret_cast<const long long *>(views_ptr), codes_2, 8);
+            __m256i values_3 = _mm256_i32gather_epi64(reinterpret_cast<const long long *>(views_ptr), codes_3, 8);
+
+            // Store values.
+            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest_views + 0), values_0);
+            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest_views + 4), values_1);
+            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest_views + 8), values_2);
+            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest_views + 12), values_3);
+
+            decompressed_codes += 16;
+            dest_views += 16;
+            row_i += 16;
+         }
+      }
+#endif
+
+      // Write remaining values (up to 15)
+      while (row_i < tuple_count) {
+         *dest_views++ = views_ptr[*decompressed_codes++];
+         row_i++;
+      }
+   }
+
+   return true;
+}
+
+std::string DynamicDictionary::fullDescription(const u8 *src) {
+   const auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+   IntegerScheme &codes_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme);
+   return this->selfDescription(src) + " -> ([int] codes) " + codes_scheme.fullDescription(col_struct.data + col_struct.codes_offset);
+}
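+/* Editor's note: a stand-alone version of the AVX2 gather pattern used in
+ * decompressNoCopy above, mapping 32-bit codes to 8-byte table entries
+ * (requires AVX2; names are illustrative):
+ *
+ *   #include <immintrin.h>
+ *   void gather64(const long long *table, const int *codes, long long *out, int n) {
+ *      int i = 0;
+ *      for (; i + 4 <= n; i += 4) {
+ *         __m128i idx = _mm_loadu_si128(reinterpret_cast<const __m128i *>(codes + i));
+ *         __m256i val = _mm256_i32gather_epi64(table, idx, 8); // scale = 8 bytes
+ *         _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + i), val);
+ *      }
+ *      for (; i < n; i++) out[i] = table[codes[i]]; // scalar tail
+ *   }
+ */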
+bool DynamicDictionary::isUsable(StringStats &stats) {
+   // This is just a hacky workaround so we can properly test the improvement from not having a raw FSST scheme to having one.
+   bool has_raw_fsst = false;
+   for (const auto &scheme : CSchemePool::available_schemes->string_schemes) {
+      if (scheme.first == StringSchemeType::FSST) {
+         has_raw_fsst = true;
+         break;
+      }
+   }
+
+   if (!has_raw_fsst) {
+      return true;
+   }
+
+   u32 non_null_count = stats.tuple_count - stats.null_count;
+   u32 unique_count = stats.unique_count;
+   // Null may actually count as one unique string in the form of an empty string
+   return unique_count < non_null_count / 2;
+}
+
+u32 DynamicDictionary::getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) {
+   const auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+   return col_struct.total_decompressed_size - ((tuple_count + 1) * sizeof(StringArrayViewer::Slot));
+}
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.hpp
new file mode 100644
index 0000000..bdfcd68
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/DynamicDictionary.hpp
@@ -0,0 +1,50 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+namespace string {
+// -------------------------------------------------------------------------------------
+struct DynamicDictionaryStructure {
+   u32 total_decompressed_size; // i.e. the original input size
+   bool use_fsst;
+   u32 fsst_offsets_offset;
+   u32 lengths_offset;
+   u32 num_codes;
+   u32 codes_offset;
+   bool use_rle_optimized_path;
+   u8 codes_scheme;
+   u8 lengths_scheme;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+class DynamicDictionary : public StringScheme {
+public:
+   double expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level) override;
+   bool usesFsst(const u8 *src) override;
+   u32 compress(StringArrayViewer src, const BITMAP *nullmap, u8 *dest, StringStats &stats) override;
+   std::string fullDescription(const u8 *src) override;
+   bool isUsable(StringStats &stats) override;
+   u32 getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   u32 getDecompressedSizeNoCopy(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   u32 getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+   void decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   bool decompressNoCopy(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+   inline StringSchemeType schemeType() override
+   {
+      return staticSchemeType();
+   }
+   inline static StringSchemeType staticSchemeType()
+   {
+      return StringSchemeType::S_DICT;
+   }
+};
+// -------------------------------------------------------------------------------------
+}
+}
+}
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.cpp
new file mode 100644
index 0000000..f45a30c
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.cpp
@@ -0,0 +1,107 @@
+#include "Fsst.hpp"
+#include "spdlog/spdlog.h"
+#include "datablock/schemes/CSchemePicker.hpp"
+
+namespace cengine::db::v2::string {
+    double Fsst::expectedCompressionRatio(StringStats &stats, u8) {
+        // TODO
+        return 1.0;
+    }
+
+    u32 Fsst::compress(const cengine::StringArrayViewer src, const BITMAP *, u8 *dest, cengine::db::StringStats &stats) {
+        // TODO: For now we consider null values to be empty strings. Maybe it would be faster to not compress them
+        // and use roaring bitmap iterate to fill the StringArrayView on decompression
+        auto &col_struct = *reinterpret_cast<FsstStructure *>(dest);
+        col_struct.total_decompressed_size = stats.total_size;
+        auto write_ptr = col_struct.data;
+
+        // This was mostly adapted from what the DynamicDictionary used before.
+        auto input_string_buffers = std::unique_ptr<u8 *[]>(new u8 *[stats.tuple_count]);
+        auto input_string_lengths = std::unique_ptr<u64[]>(new u64[stats.tuple_count]);
+        auto output_string_buffers = std::unique_ptr<u8 *[]>(new u8 *[stats.tuple_count]);
+        auto output_string_lengths = std::unique_ptr<u64[]>(new u64[stats.tuple_count]);
+
+        // Prepare necessary arrays
+        for (u32 str_i = 0; str_i < stats.tuple_count; str_i++) {
+            input_string_buffers[str_i] = const_cast<u8 *>(reinterpret_cast<const u8 *>(src.get_pointer(str_i)));
+            input_string_lengths[str_i] = src.size(str_i);
+        }
+
+        // Prepare encoder and write header
+        fsst_encoder_t *encoder = fsst_create(stats.tuple_count, input_string_lengths.get(), input_string_buffers.get(), 0);
+        die_if(fsst_export(encoder, write_ptr) > 0);
+        auto fsst_table_used_space = FSST_MAXHEADER;
+        write_ptr += fsst_table_used_space;
+        col_struct.strings_offset = write_ptr - col_struct.data;
+
+        // Compress strings
+        // TODO: why is this "fake"? Fix it.
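+        /* Editorial note (not in the original): "fake" because no buffer of
+         * this size is allocated; the value is only the capacity limit handed
+         * to fsst_compress, which compresses as many strings as fit. FSST's
+         * worst case emits 2 output bytes per input byte (escape + literal),
+         * so a tighter safe bound would be, for instance:
+         *
+         *   const u64 safe_capacity = 7 + 2 * stats.total_length;
+         *
+         * The 4x factor used below is therefore comfortably sufficient.
+         */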
+        const u64 output_buffer_size = 7 + 4 * stats.total_length; // fake
+        if ( fsst_compress(encoder, stats.tuple_count, input_string_lengths.get(), input_string_buffers.get(),
+                           output_buffer_size, write_ptr,
+                           output_string_lengths.get(), output_string_buffers.get()) != stats.tuple_count ) {
+            throw Generic_Exception("FSST Compression failed !");
+        }
+        u64 fsst_strings_used_space = output_string_lengths[stats.tuple_count - 1] + (output_string_buffers[stats.tuple_count - 1] - output_string_buffers[0]);
+
+        spdlog::debug("FSST: strings_s = {}", static_cast<s64>(fsst_strings_used_space));
+        spdlog::debug("FSST: string_size : before = {} after = {}", static_cast<s64>(stats.total_length), static_cast<s64>(fsst_table_used_space + fsst_strings_used_space));
+
+        col_struct.compressed_strings_size = fsst_strings_used_space;
+        write_ptr += fsst_strings_used_space;
+        col_struct.offsets_offset = write_ptr - col_struct.data;
+
+        // TODO: think about how to handle the cascading_level properly
+        u32 used_space;
+        IntegerSchemePicker::compress(reinterpret_cast<const INTEGER *>(src.slots_ptr), nullptr, write_ptr, stats.tuple_count + 1, 2, used_space, col_struct.offsets_scheme);
+        write_ptr += used_space;
+
+        return write_ptr - dest;
+    }
+
+    u32 Fsst::getDecompressedSize(const u8 *src, u32, BitmapWrapper *) {
+        auto &col_struct = *reinterpret_cast<const FsstStructure *>(src);
+        return col_struct.total_decompressed_size;
+    }
+
+    void Fsst::decompress(u8 *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level) {
+        auto &col_struct = *reinterpret_cast<const FsstStructure *>(src);
+        auto dest_slots = reinterpret_cast<StringArrayViewer::Slot *>(dest);
+        auto dest_write_ptr = dest + sizeof(StringArrayViewer::Slot) * (tuple_count + 1);
+
+        // Decompress offsets
+        auto compressed_offsets = col_struct.data + col_struct.offsets_offset;
+        IntegerScheme &offsets_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.offsets_scheme);
+        offsets_scheme.decompress(reinterpret_cast<INTEGER *>(dest_slots), nullptr, compressed_offsets, tuple_count + 1, level + 1);
+
+        // Decompress strings
+        fsst_decoder_t decoder;
+        die_if(fsst_import(&decoder, const_cast<u8 *>(col_struct.data)) > 0);
+        auto compressed_strings = const_cast<u8 *>(col_struct.data + col_struct.strings_offset);
+        auto decompressed_strings_size = col_struct.total_decompressed_size - ((tuple_count + 1) * sizeof(INTEGER));
+        decompressed_strings_size += 4096;
+        dest_write_ptr += fsst_decompress(&decoder, col_struct.compressed_strings_size, compressed_strings, decompressed_strings_size, dest_write_ptr);
+        die_if(dest_write_ptr - dest == col_struct.total_decompressed_size);
+    }
+
+    std::string Fsst::fullDescription(const u8 *src) {
+        auto &col_struct = *reinterpret_cast<const FsstStructure *>(src);
+        IntegerScheme &offsets_scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.offsets_scheme);
+        return this->selfDescription(src) + " -> ([int] offsets) " + offsets_scheme.fullDescription(col_struct.data + col_struct.offsets_offset);
+    }
+
+    bool Fsst::isUsable(StringStats &stats) {
+        u32 non_null_count = stats.tuple_count - stats.null_count;
+        u32 unique_count = stats.unique_count;
+        // Null may actually count as one unique string in the form of an empty string
+        if (unique_count < non_null_count / 2) {
+            return false;
+        }
+        return stats.total_length > FSST_THRESHOLD;
+    }
+
+    u32 Fsst::getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *) {
+        auto &col_struct = *reinterpret_cast<const FsstStructure *>(src);
+        return col_struct.total_decompressed_size - ((tuple_count + 1) * sizeof(StringArrayViewer::Slot));
+    }
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.hpp
new file mode 100644
index 0000000..a348a16
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/string/Fsst.hpp
@@ -0,0 +1,33 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+#include "fsst.h"
+
+namespace cengine::db::v2::string {
+    struct FsstStructure {
+        u32 total_decompressed_size;
+        u32 compressed_strings_size;
+        u32 strings_offset;
+        u32 offsets_offset;
+        u8 offsets_scheme;
+        u8 data[];
+    };
+
+    class Fsst : public StringScheme {
+    public:
+        double expectedCompressionRatio(StringStats &stats, u8 allowed_cascading_level) override;
+        u32 compress(StringArrayViewer src, const BITMAP *nullmap, u8 *dest, StringStats &stats) override;
+        u32 getDecompressedSize(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+        u32 getTotalLength(const u8 *src, u32 tuple_count, BitmapWrapper *nullmap) override;
+        void decompress(u8 *dest, BitmapWrapper *nullmap, const u8 *src, u32 tuple_count, u32 level) override;
+        bool isUsable(StringStats &stats) override;
+        std::string fullDescription(const u8 *src) override;
+        inline StringSchemeType schemeType() override
+        {
+            return staticSchemeType();
+        }
+        inline static StringSchemeType staticSchemeType()
+        {
+            return StringSchemeType::FSST;
+        }
+    };
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/DynamicDictionary.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/DynamicDictionary.hpp
new file mode 100644
index 0000000..584167e
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/DynamicDictionary.hpp
@@ -0,0 +1,200 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "utils/Utils.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+struct __attribute__((packed)) DynamicDictionaryStructure {
+   u8 codes_scheme_code;
+   u32 codes_offset;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
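+/* Editor's note: packing matters because this struct doubles as the on-buffer
+ * header that data[] directly follows; without it the compiler would insert
+ * 3 padding bytes between codes_scheme_code and codes_offset:
+ *
+ *   struct __attribute__((packed)) P { u8 a; u32 b; }; // sizeof(P) == 5
+ *   struct U { u8 a; u32 b; };                         // sizeof(U) == 8
+ */
+// -------------------------------------------------------------------------------------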
+// Used for integers and doubles
+template <typename NumberType, typename StatsType>
+class TDynamicDictionary {
+public:
+// -------------------------------------------------------------------------------------
+   static inline double expectedCompressionRatio(StatsType &stats, u8 allowed_cascading_level)
+   {
+      if (allowed_cascading_level <= 1) {
+         return 0;
+      }
+      u32 after_size = ((stats.unique_count * sizeof(NumberType)) + (stats.tuple_count * (Utils::getBitsNeeded(stats.unique_count) / 8.0)));
+      after_size += sizeof(DynamicDictionaryStructure) + 5; // 5 for the PBP header
+      after_size += (stats.tuple_count) * 2 / 128; // TODO: find out the overhead of FastPFOR
+      return CD(stats.total_size) / CD(after_size);
+   }
+// -------------------------------------------------------------------------------------
+   static inline u32 compressColumn(const NumberType *src, const BITMAP *, u8 *dest, StatsType &stats, u8 allowed_cascading_level)
+   {
+      // Layout: DICT | CODES
+      auto &col_struct = *reinterpret_cast<DynamicDictionaryStructure *>(dest);
+      // -------------------------------------------------------------------------------------
+      // Write dictionary
+      auto dict_slots = reinterpret_cast<NumberType *>(col_struct.data);
+      u32 distinct_i = 0;
+      for ( const auto &distinct_element : stats.distinct_values ) {
+         dict_slots[distinct_i] = distinct_element.first;
+         distinct_i++;
+      }
+      // -------------------------------------------------------------------------------------
+      auto dict_begin = reinterpret_cast<NumberType *>(col_struct.data);
+      auto dict_end = dict_begin + distinct_i;
+      // -------------------------------------------------------------------------------------
+      vector<INTEGER> codes;
+      for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+         auto it = std::lower_bound(dict_begin, dict_end, src[row_i]);
+         if ( it == dict_end ) {
+            die_if(stats.distinct_values.find(src[row_i]) != stats.distinct_values.end());
+         }
+         die_if(it != dict_end);
+         codes.push_back(std::distance(dict_begin, it));
+      }
+      // -------------------------------------------------------------------------------------
+      // Compress codes
+      auto write_ptr = reinterpret_cast<u8 *>(dict_end);
+      col_struct.codes_offset = write_ptr - col_struct.data;
+      u32 used_space;
+      // For number dictionaries we only need FBP/PBP for coding; if any other scheme were useful beneath,
+      // it would rather have been chosen instead of X_DICT in the first place.
+      IntegerSchemePicker::compress(codes.data(), nullptr, write_ptr, codes.size(), allowed_cascading_level - 1, used_space, col_struct.codes_scheme_code, CB(IntegerSchemeType::X_FBP), "codes");
+      // -------------------------------------------------------------------------------------
+      spdlog::debug("X_DICT: codes_c = {} codes_s = {}", CI(col_struct.codes_scheme_code), CI(used_space));
+      // -------------------------------------------------------------------------------------
+      write_ptr += used_space;
+      // -------------------------------------------------------------------------------------
+      return write_ptr - dest;
+   }
+// -------------------------------------------------------------------------------------
+   static inline void decompressColumn(NumberType *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+   {
+      auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+      // -------------------------------------------------------------------------------------
+      // Decode codes
+      thread_local std::vector<std::vector<INTEGER>> codes_v;
+      auto codes = get_level_data(codes_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+      IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme_code);
+      scheme.decompress(codes, nullptr, col_struct.data + col_struct.codes_offset, tuple_count, level + 1);
+      // -------------------------------------------------------------------------------------
+      auto dict = reinterpret_cast<const NumberType *>(col_struct.data);
+      for (u32 i = 0; i < tuple_count; i++) {
+         dest[i] = dict[codes[i]];
+      }
+   }
+// -------------------------------------------------------------------------------------
+   static inline string fullDescription(const u8 *src, const string &selfDescription) {
+      auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+      IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme_code);
+      return selfDescription + " -> ([int] codes) " + scheme.fullDescription(col_struct.data + col_struct.codes_offset);
+   }
+};
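+/* Editor's note: compressColumn assigns codes by binary search over the sorted
+ * dictionary (std::map iterates keys in ascending order, so the dictionary is
+ * sorted by construction). In isolation:
+ *
+ *   u32 code_of(const INTEGER *dict, u32 dict_size, INTEGER v) {
+ *      auto it = std::lower_bound(dict, dict + dict_size, v);
+ *      assert(it != dict + dict_size && *it == v); // v must be in the dictionary
+ *      return static_cast<u32>(it - dict);
+ *   }
+ *
+ * The specializations below replace the scalar dict[codes[i]] loop with AVX2
+ * gathers.
+ */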
+template <>
+inline void TDynamicDictionary<INTEGER, SInteger32Stats>::decompressColumn(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level) {
+   BTR_IFSIMD({
+      static_assert(sizeof(*dest) == 4);
+      static_assert(SIMD_EXTRA_BYTES >= 4 * sizeof(__m256i));
+   })
+
+   auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+
+   // Decode codes
+   thread_local std::vector<std::vector<INTEGER>> codes_v;
+   auto codes = get_level_data(codes_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+   IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme_code);
+   scheme.decompress(codes, nullptr, col_struct.data + col_struct.codes_offset, tuple_count, level + 1);
+
+   auto dict = reinterpret_cast<const INTEGER *>(col_struct.data);
+   u32 i = 0;
+#ifdef BTR_USE_SIMD
+   if (tuple_count >= 32) {
+      while (i < tuple_count - 31) {
+         // Load codes.
+         __m256i codes_0 = _mm256_loadu_si256(reinterpret_cast<__m256i *>(codes + 0));
+         __m256i codes_1 = _mm256_loadu_si256(reinterpret_cast<__m256i *>(codes + 8));
+         __m256i codes_2 = _mm256_loadu_si256(reinterpret_cast<__m256i *>(codes + 16));
+         __m256i codes_3 = _mm256_loadu_si256(reinterpret_cast<__m256i *>(codes + 24));
+
+         // Gather values.
+         __m256i values_0 = _mm256_i32gather_epi32(dict, codes_0, 4);
+         __m256i values_1 = _mm256_i32gather_epi32(dict, codes_1, 4);
+         __m256i values_2 = _mm256_i32gather_epi32(dict, codes_2, 4);
+         __m256i values_3 = _mm256_i32gather_epi32(dict, codes_3, 4);
+
+         // Store values.
+         _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest + 0), values_0);
+         _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest + 8), values_1);
+         _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest + 16), values_2);
+         _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest + 24), values_3);
+
+         dest += 32;
+         codes += 32;
+         i += 32;
+      }
+   }
+#endif
+
+   while (i < tuple_count) {
+      *dest++ = dict[*codes++];
+      i++;
+   }
+}
+
+template <>
+inline void TDynamicDictionary<DOUBLE, DoubleStats>::decompressColumn(DOUBLE *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level) {
+   BTR_IFSIMD({
+      static_assert(sizeof(*dest) == 8);
+      static_assert(SIMD_EXTRA_BYTES >= 4 * sizeof(__m256d));
+   })
+
+   auto &col_struct = *reinterpret_cast<const DynamicDictionaryStructure *>(src);
+
+   // Decode codes
+   thread_local std::vector<std::vector<INTEGER>> codes_v;
+   auto codes = get_level_data(codes_v, tuple_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+   IntegerScheme &scheme = IntegerSchemePicker::MyTypeWrapper::getScheme(col_struct.codes_scheme_code);
+   scheme.decompress(codes, nullptr, col_struct.data + col_struct.codes_offset, tuple_count, level + 1);
+
+   auto dict = reinterpret_cast<const DOUBLE *>(col_struct.data);
+   u32 i = 0;
+#ifdef BTR_USE_SIMD
+   if (tuple_count >= 16) {
+      while (i < tuple_count - 15) {
+         // Load codes
+         __m128i codes_0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(codes + 0));
+         __m128i codes_1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(codes + 4));
+         __m128i codes_2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(codes + 8));
+         __m128i codes_3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(codes + 12));
+
+         // Gather values
+         __m256d values_0 = _mm256_i32gather_pd(dict, codes_0, 8);
+         __m256d values_1 = _mm256_i32gather_pd(dict, codes_1, 8);
+         __m256d values_2 = _mm256_i32gather_pd(dict, codes_2, 8);
+         __m256d values_3 = _mm256_i32gather_pd(dict, codes_3, 8);
+
+         // Store values
+         _mm256_storeu_pd(dest + 0, values_0);
+         _mm256_storeu_pd(dest + 4, values_1);
+         _mm256_storeu_pd(dest + 8, values_2);
+         _mm256_storeu_pd(dest + 12, values_3);
+
+         dest += 16;
+         codes += 16;
+         i += 16;
+      }
+   }
+#endif
+
+   while (i < tuple_count) {
+      *dest++ = dict[*codes++];
+      i++;
+   }
+}
+}
+}
+}
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/FOR.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/FOR.hpp
new file mode 100644
index 0000000..4bda674
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/FOR.hpp
@@ -0,0 +1,58 @@
+//#pragma once
+//#include "datablock/schemes/CScheme.hpp" +//#include "datablock/schemes/CSchemePicker.hpp" +//// ------------------------------------------------------------------------------------- +//// ------------------------------------------------------------------------------------- +//// ------------------------------------------------------------------------------------- +//namespace cengine { +//namespace db { +//namespace v2 { +//template +//struct FORStructure { +// NumberType bias; +// u8 next_scheme; +// u8 data[]; +//}; +//// ------------------------------------------------------------------------------------- +//template +//class TFOR { +//public: +// static inline double expectedCompressionRatio(StatsType &stats) +// { +// return 0; +// } +// // ------------------------------------------------------------------------------------- +// static inline u32 compressColumn(const NumberType *src, const BITMAP *, u8 *dest, StatsType &stats, u8 allowed_cascading_level) +// { +// auto &col_struct = *reinterpret_cast (dest); +// vector biased_output; +// // ------------------------------------------------------------------------------------- +// col_struct.bias = stats.min; +// for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) { +// if ( nullmap == nullptr || nullmap[row_i] ) { +// biased_output.push_back(src[row_i] - col_struct.bias); +// } else { +// biased_output.push_back(src[row_i]); +// } +// } +// // ------------------------------------------------------------------------------------- +// // Next Level compression +// auto write_ptr = col_struct.data; +// u32 used_space; +// IntegerSchemePicker::compress(biased_output.data(), nullmap, write_ptr, biased_output.size(), allowed_cascading_level - 1, used_space, col_struct.next_scheme); +// write_ptr += used_space; +// // ------------------------------------------------------------------------------------- +// return write_ptr - dest; +// } +// // ------------------------------------------------------------------------------------- +// static inline +// void decompressColumn(NumberType *dest, BITMAP *, const u8 *src, u32 tuple_count) +// { +// +// } +//} +//// ------------------------------------------------------------------------------------- +//}; +//} +//} +//} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/Frequency.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/Frequency.hpp new file mode 100644 index 0000000..6061a56 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/Frequency.hpp @@ -0,0 +1,144 @@ +#pragma once + +#include +#include "datablock/schemes/CScheme.hpp" +#include "datablock/schemes/CSchemePicker.hpp" +// ------------------------------------------------------------------------------------- +#include "roaring/roaring.hh" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +namespace v2 { +template +struct FrequencyStructure { + NumberType top_value; + u32 exceptions_offset; + u8 next_scheme; + u8 data[]; +}; +// ------------------------------------------------------------------------------------- +template +class TFrequency { +public: + static inline double expectedCompressionRatio(StatsType &stats) + { + // Threshold : stats.max larger than 1 byte (b/c PBP would do just as well as frequency) + // top_1 occurrence pct >= 
+      // TODO: run it on a sample
+      if ( stats.unique_count <= 1 )
+         return 0;
+
+      if ( CD(stats.null_count) * 100.0 / CD(stats.tuple_count) >= 90 ) {
+         return stats.tuple_count - 1;
+      }
+
+      u32 occurrence_count = stats.distinct_values.begin()->second;
+      for ( const auto &t : stats.distinct_values ) {
+         if ( t.second > occurrence_count )
+            occurrence_count = t.second;
+      }
+
+      if ( CD(occurrence_count) * 100.0 / CD(stats.tuple_count) >= 90 ) {
+         if ( stats.max >= (1 << 8) ) {
+            return stats.tuple_count - 1;
+         }
+      }
+      return 0;
+   }
+// -------------------------------------------------------------------------------------
+   static inline u32 compressColumn(const NumberType *src, const BITMAP *nullmap, u8 *dest, StatsType &stats, u8 allowed_cascading_level)
+   {
+      auto &col_struct = *new (dest) (FrequencyStructure<NumberType>);
+      // -------------------------------------------------------------------------------------
+      if ( CD(stats.null_count) * 100.0 / CD(stats.tuple_count) >= 90 ) {
+         col_struct.top_value = NULL_CODE;
+      } else {
+         col_struct.top_value = stats.distinct_values.begin()->first;
+         u32 occurrence_count = stats.distinct_values.begin()->second;
+         for ( const auto &t : stats.distinct_values ) {
+            if ( t.second > occurrence_count ) {
+               occurrence_count = t.second;
+               col_struct.top_value = t.first;
+            }
+         }
+      }
+      // -------------------------------------------------------------------------------------
+      vector<NumberType> exceptions;
+      Roaring exceptions_bitmap;
+      for ( u32 row_i = 0; row_i < stats.tuple_count; row_i++ ) {
+         if ( src[row_i] != col_struct.top_value && (nullmap == nullptr || nullmap[row_i]) ) {
+            exceptions.push_back(src[row_i]);
+            exceptions_bitmap.add(row_i);
+         }
+      }
+      die_if(exceptions_bitmap.cardinality() == exceptions.size());
+      // -------------------------------------------------------------------------------------
+      auto write_ptr = col_struct.data;
+      // -------------------------------------------------------------------------------------
+      exceptions_bitmap.runOptimize();
+      exceptions_bitmap.setCopyOnWrite(true);
+      write_ptr += exceptions_bitmap.write(reinterpret_cast<char *>(write_ptr), false);
+      col_struct.exceptions_offset = write_ptr - col_struct.data;
+      // -------------------------------------------------------------------------------------
+      {
+         // Compress exceptions
+         u32 used_space;
+         CSchemePicker<NumberType, SchemeType, StatsType, SchemeCodeType>::compress(exceptions.data(), nullptr, write_ptr, exceptions.size(), allowed_cascading_level - 1, used_space, col_struct.next_scheme, AUTO_SCHEME, "exceptions");
+         write_ptr += used_space;
+      }
+      // -------------------------------------------------------------------------------------
+      return write_ptr - dest;
+   }
+// -------------------------------------------------------------------------------------
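+/* Editor's note: toy illustration of the scheme above. Input {7,7,7,7,3,7,9,7}
+ * yields top_value = 7, exceptions_bitmap = {4, 6} and exceptions = {3, 9};
+ * decompressColumn below fills the whole output with 7 and then patches
+ * positions 4 and 6.
+ */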
+   static inline void decompressColumn(NumberType *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+   {
+      const auto &col_struct = *reinterpret_cast<const FrequencyStructure<NumberType> *>(src);
+      // -------------------------------------------------------------------------------------
+      Roaring exceptions_bitmap = Roaring::read(reinterpret_cast<const char *>(col_struct.data), false);
+      thread_local std::vector<std::vector<NumberType>> exceptions_v;
+      auto exceptions = get_level_data(exceptions_v, exceptions_bitmap.cardinality() + SIMD_EXTRA_ELEMENTS(NumberType), level);
+      if ( exceptions_bitmap.cardinality() > 0 ) {
+         CSchemePicker<NumberType, SchemeType, StatsType, SchemeCodeType>::MyTypeWrapper::getScheme(
+            col_struct.next_scheme).decompress(
+            exceptions, nullptr, col_struct.data + col_struct.exceptions_offset,
+            exceptions_bitmap.cardinality(), level + 1);
+      }
+      // -------------------------------------------------------------------------------------
+      // First write the top value to every single entry
+      for (u32 i = 0; i < tuple_count; i++) {
+         dest[i] = col_struct.top_value;
+      }
+
+      // Now fix every single entry that is an exception
+      std::pair<NumberType *, NumberType *> param = {dest, exceptions};
+      exceptions_bitmap.iterate([](uint32_t value, void *param) {
+         // Note: Roaring's iterate visits the set positions in ascending order,
+         // so the exception values (stored in that same order) line up correctly.
+         auto p = reinterpret_cast<std::pair<NumberType *, NumberType *> *>(param);
+         // dest[value] = *exceptions_ptr++;
+         p->first[value] = *(p->second);
+         p->second++;
+         return true;
+      },
+      &param);
+   }
+// -------------------------------------------------------------------------------------
+   static inline string fullDescription(const u8 *src, const string &selfDescription) {
+      const auto &col_struct = *reinterpret_cast<const FrequencyStructure<NumberType> *>(src);
+      auto result = selfDescription;
+
+      Roaring exceptions_bitmap = Roaring::read(reinterpret_cast<const char *>(col_struct.data), false);
+      if ( exceptions_bitmap.cardinality() > 0 ) {
+         auto &scheme = CSchemePicker<NumberType, SchemeType, StatsType, SchemeCodeType>::MyTypeWrapper::getScheme(col_struct.next_scheme);
+         result += " -> ([NumberType] exceptions) " + scheme.fullDescription(col_struct.data + col_struct.exceptions_offset);
+      }
+
+      return result;
+   }
+};
+}
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/RLE.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/RLE.hpp
new file mode 100644
index 0000000..a8a494f
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/schemes/v2/templated/RLE.hpp
@@ -0,0 +1,211 @@
+#pragma once
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePicker.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+namespace v2 {
+struct RLEStructure {
+   u32 runs_count;
+   u32 runs_count_offset;
+   u8 values_scheme_code;
+   u8 counts_scheme_code;
+   u8 data[];
+};
+// -------------------------------------------------------------------------------------
+template <typename NumberType, typename SchemeType, typename StatsType, typename SchemeCodeType>
+class TRLE {
+public:
+// -------------------------------------------------------------------------------------
+   static inline u32 compressColumn(const NumberType *src, const BITMAP *nullmap, u8 *dest, StatsType &stats, u8 allowed_cascading_level, u8 force_values = AUTO_SCHEME, u8 force_counts = AUTO_SCHEME)
+   {
+      auto &col_struct = *reinterpret_cast<RLEStructure *>(dest);
+      // -------------------------------------------------------------------------------------
+      std::vector<NumberType> rle_values;
+      std::vector<INTEGER> rle_count;
+      // -------------------------------------------------------------------------------------
+      // RLE encoding
+      NumberType last_item = src[0];
+      INTEGER count = 1;
+      for ( uint32_t row_i = 1; row_i < stats.tuple_count; row_i++ ) {
+         if ( src[row_i] == last_item || (nullmap != nullptr && !nullmap[row_i]) ) { // the null trick brought no compression benefits
+            count++;
+         } else {
+            rle_count.push_back(count);
+            rle_values.push_back(last_item);
+            last_item = src[row_i];
+            count = 1;
+         }
+      }
+      rle_count.push_back(count);
+      rle_values.push_back(last_item);
+      // -------------------------------------------------------------------------------------
+      col_struct.runs_count = rle_count.size();
+      die_if(rle_count.size() == rle_values.size());
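+      /* Editor's note: at this point the column is held as two parallel arrays:
+       * rle_values[i] repeated rle_count[i] times, with the counts summing to
+       * tuple_count. E.g. {5,5,5,1,1,9} -> rle_values = {5,1,9},
+       * rle_count = {3,2,1}. Each array is then compressed with its own
+       * cascaded scheme below.
+       */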
+      // -------------------------------------------------------------------------------------
+      auto write_ptr = col_struct.data;
+      // -------------------------------------------------------------------------------------
+      // Compress values
+      {
+         u32 used_space;
+         CSchemePicker<NumberType, SchemeType, StatsType, SchemeCodeType>::compress(rle_values.data(), nullptr, write_ptr, rle_values.size(), allowed_cascading_level - 1, used_space, col_struct.values_scheme_code, force_values, "values");
+         write_ptr += used_space;
+         // -------------------------------------------------------------------------------------
+         spdlog::debug("X_RLE: values_c = {} values_s = {}", CI(col_struct.values_scheme_code), CI(used_space));
+         // -------------------------------------------------------------------------------------
+      }
+      // -------------------------------------------------------------------------------------
+      // Compress counts
+      {
+         col_struct.runs_count_offset = write_ptr - col_struct.data;
+         u32 used_space;
+         IntegerSchemePicker::compress(rle_count.data(), nullptr, write_ptr, rle_count.size(), allowed_cascading_level - 1, used_space, col_struct.counts_scheme_code, force_counts, "counts");
+         write_ptr += used_space;
+         // -------------------------------------------------------------------------------------
+         spdlog::debug("X_RLE: count_c = {} count_s = {}", CI(col_struct.counts_scheme_code), CI(used_space));
+         // -------------------------------------------------------------------------------------
+      }
+      // -------------------------------------------------------------------------------------
+      return write_ptr - dest;
+   }
+
+   static inline string fullDescription(const u8 *src, const string &selfDescription) {
+      const auto &col_struct = *reinterpret_cast<const RLEStructure *>(src);
+      string result = selfDescription;
+
+      auto &value_scheme = TypeWrapper<SchemeType, SchemeCodeType>::getScheme(col_struct.values_scheme_code);
+      result += "\n\t-> ([valueType] values) " + value_scheme.fullDescription(col_struct.data);
+
+      IntegerScheme &counts_scheme = TypeWrapper<IntegerScheme, IntegerSchemeType>::getScheme(col_struct.counts_scheme_code);
+      result += "\n\t-> ([int] counts) " + counts_scheme.fullDescription(col_struct.data + col_struct.runs_count_offset);
+
+      return result;
+   }
+// -------------------------------------------------------------------------------------
+   static inline void decompressColumn(NumberType *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+   {
+      throw Generic_Exception("Unsupported templated specialization");
+   }
+
+   static inline u32 decompressRuns(NumberType *values, INTEGER *counts, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level)
+   {
+      const auto &col_struct = *reinterpret_cast<const RLEStructure *>(src);
+      auto &value_scheme = TypeWrapper<SchemeType, SchemeCodeType>::getScheme(col_struct.values_scheme_code);
+      value_scheme.decompress(values, nullptr, col_struct.data, col_struct.runs_count, level + 1);
+
+      IntegerScheme &counts_scheme = TypeWrapper<IntegerScheme, IntegerSchemeType>::getScheme(col_struct.counts_scheme_code);
+      counts_scheme.decompress(counts, nullptr, col_struct.data + col_struct.runs_count_offset,
+                               col_struct.runs_count, level + 1);
+
+      return col_struct.runs_count;
+   }
+// -------------------------------------------------------------------------------------
+};
+
+template <>
+inline void TRLE<INTEGER, IntegerScheme, SInteger32Stats, IntegerSchemeType>::decompressColumn(INTEGER *dest, BitmapWrapper *, const u8 *src, u32 tuple_count, u32 level) {
+   static_assert(sizeof(*dest) == 4);
+
+   const auto &col_struct = *reinterpret_cast<const RLEStructure *>(src);
+   // -------------------------------------------------------------------------------------
+   // Decompress values
+   thread_local std::vector<std::vector<INTEGER>> values_v;
+   auto values = get_level_data(values_v, col_struct.runs_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+   {
+      IntegerScheme &scheme = TypeWrapper<IntegerScheme, IntegerSchemeType>::getScheme(col_struct.values_scheme_code);
+      scheme.decompress(values, nullptr, col_struct.data, col_struct.runs_count, level + 1);
+   }
+   // -------------------------------------------------------------------------------------
+   // Decompress counts
+   thread_local std::vector<std::vector<INTEGER>> counts_v;
+   auto counts = get_level_data(counts_v, col_struct.runs_count + SIMD_EXTRA_ELEMENTS(INTEGER), level);
+   {
+      IntegerScheme &scheme = TypeWrapper<IntegerScheme, IntegerSchemeType>::getScheme(col_struct.counts_scheme_code);
+      scheme.decompress(counts, nullptr, col_struct.data + col_struct.runs_count_offset,
+                        col_struct.runs_count, level + 1);
+   }
+   // -------------------------------------------------------------------------------------
+   auto write_ptr = dest;
+#ifdef BTR_USE_SIMD
+   for ( u32 run_i = 0; run_i < col_struct.runs_count; run_i++ ) {
+      auto target_ptr = write_ptr + counts[run_i];
+
+      /*
+       * I tried several variations of vectorizing this. Using AVX2 directly is the fastest,
+       * even when there are many very short runs. The penalty of branching simply outweighs
+       * the few instructions saved by not using AVX2 for short runs.
+       */
+      // set is a sequential operation
+      __m256i vec = _mm256_set1_epi32(values[run_i]);
+      while (write_ptr < target_ptr) {
+         // store is performed in a single cycle
+         _mm256_storeu_si256(reinterpret_cast<__m256i *>(write_ptr), vec);
+         write_ptr += 8;
+      }
+      write_ptr = target_ptr;
+   }
+#else
+   for ( u32 run_i = 0; run_i < col_struct.runs_count; run_i++ ) {
+      auto val = values[run_i];
+      auto target_ptr = write_ptr + counts[run_i];
+      while (write_ptr != target_ptr) {
+         *write_ptr++ = val;
+      }
+   }
+#endif
+}
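+/* Editor's note: the SIMD expansion above intentionally overshoots: it keeps
+ * storing full 8-lane vectors until write_ptr passes target_ptr, then snaps
+ * write_ptr back, letting the next run overwrite the excess. This is safe only
+ * because the destination is over-allocated by SIMD_EXTRA_BYTES. The scalar
+ * equivalent of one run is simply:
+ *
+ *   std::fill_n(write_ptr, counts[run_i], values[run_i]);
+ */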
(write_ptr != target_ptr) { + *write_ptr++ = val; + } + } +#endif +} +} +} +} diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/stats/NumberStats.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/stats/NumberStats.hpp new file mode 100644 index 0000000..9e65bb4 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/stats/NumberStats.hpp @@ -0,0 +1,134 @@ +#pragma once +#include "Units.hpp" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +namespace cengine { +namespace db { +// ------------------------------------------------------------------------------------- +template +struct NumberStats { +public: + NumberStats(const T *src, const BITMAP *bitmap, u32 tuple_count) + : src(src) + , bitmap(bitmap) + , tuple_count(tuple_count) {} + // ------------------------------------------------------------------------------------- + const T *src; + const BITMAP *bitmap; + std::map distinct_values; + T min; + T max; + NumberStats() = delete; + // ------------------------------------------------------------------------------------- + u32 tuple_count; + u32 total_size; + u32 null_count; + u32 unique_count; + u32 set_count; + u32 average_run_length; + bool is_sorted; + // ------------------------------------------------------------------------------------- + tuple, vector> samples(u32 n, u32 length) + { + // ------------------------------------------------------------------------------------- + std::random_device rd; //Will be used to obtain a seed for the random number engine + std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd() + // ------------------------------------------------------------------------------------- + // TODO : Construction Site !! need a better theory and algorithm for sampling + // Constraints: RLE(runs), nulls, uniqueness, + // naive approach to pick enough elements that approximate stratified sample + // run ~8 normal rounds, then check if the + vector compiled_values; + vector compiled_bitmap; + // ------------------------------------------------------------------------------------- + if ( tuple_count <= n * length ) { + compiled_values.insert(compiled_values.end(), src, src + tuple_count); + if ( bitmap != nullptr ) { + compiled_bitmap.insert(compiled_bitmap.end(), bitmap, bitmap + tuple_count); + } else { + compiled_bitmap.insert(compiled_bitmap.end(), tuple_count, 1); + } + } else { + u32 separator = tuple_count / n; // how big is the slice of the input, of which we take a part.... + u32 remainder = tuple_count % n; + for ( u32 sample_i = 0; sample_i < n; sample_i++ ) { + u32 range_end = ((sample_i == n - 1) ? 
(separator + remainder) : separator) - length; + std::uniform_int_distribution<> dis(0, range_end); + u32 partition_begin = sample_i * separator + dis(gen); + // (sample_i * separator, (sample_i + 1 ) * separator) range to pick from + compiled_values.insert(compiled_values.end(), src + partition_begin, src + partition_begin + length); + if ( bitmap == nullptr ) { + compiled_bitmap.insert(compiled_bitmap.end(), length, 1); + } else { + compiled_bitmap.insert(compiled_bitmap.end(), bitmap + partition_begin, bitmap + partition_begin + length); + } + } + } + + return std::make_tuple(compiled_values, compiled_bitmap); + } + // ------------------------------------------------------------------------------------- + static NumberStats generateStats(const T *src, const BITMAP *nullmap, u32 tuple_count) + { + NumberStats stats(src, nullmap, tuple_count); + // ------------------------------------------------------------------------------------- + stats.tuple_count = tuple_count; + stats.total_size = tuple_count * sizeof(T); + stats.null_count = 0; + stats.average_run_length = 0; + stats.is_sorted = true; + // ------------------------------------------------------------------------------------- + bool is_init_value_initialized = false; + // ------------------------------------------------------------------------------------- + // Let NULL_CODE (0) of null values also taken into stats consideration + // ------------------------------------------------------------------------------------- + T last_value; + u32 run_count = 0; + // ------------------------------------------------------------------------------------- + for ( u64 row_i = 0; row_i < tuple_count; row_i++ ) { + if ( !is_init_value_initialized ) { + stats.min = stats.max = last_value = src[0]; + is_init_value_initialized = true; + } + // ------------------------------------------------------------------------------------- + auto current_value = src[row_i]; + if ( current_value != last_value && (nullmap == nullptr || nullmap[row_i])) { + if(current_value < last_value) { + stats.is_sorted = false; + } + last_value = current_value; + run_count++; + } + if ( stats.distinct_values.find(current_value) == stats.distinct_values.end()) { + stats.distinct_values.insert({current_value, 1}); + } else { + stats.distinct_values[current_value]++; + } + if ( current_value > stats.max ) { + stats.max = current_value; + } else if ( current_value < stats.min ) { + stats.min = current_value; + } + if ( nullmap != nullptr && !nullmap[row_i] ) { + stats.null_count++; + continue; + } + } + run_count++; + // ------------------------------------------------------------------------------------- + stats.average_run_length = CD(tuple_count) / CD(run_count); + stats.unique_count = stats.distinct_values.size(); + stats.set_count = stats.tuple_count - stats.null_count; + // ------------------------------------------------------------------------------------- + return stats; + } +}; +// ------------------------------------------------------------------------------------- +} +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.cpp b/benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.cpp new file mode 100644 index 0000000..c1ea60b --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.cpp @@ -0,0 +1,39 @@ +#include "StringStats.hpp" +// 
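As a sanity check on the run accounting in NumberStats::generateStats above (a value change starts a new run, plus one final increment after the loop), here is a tiny self-contained worked example; it assumes no nulls, which generateStats handles separately.

#include <cassert>
#include <cstdint>

// For src = {7, 7, 7, 3, 3, 9}: value changes at 7->3 and 3->9 plus the
// implicit first run give 3 runs, so average_run_length = 6 / 3 = 2
// (truncated when stored into a u32, as above).
static uint32_t average_run_length(const int32_t *src, uint32_t n) {
    uint32_t runs = 0;
    for (uint32_t i = 0; i < n; i++)
        if (i == 0 || src[i] != src[i - 1]) runs++;
    return runs ? n / runs : 0;
}

int main() {
    const int32_t src[] = {7, 7, 7, 3, 3, 9};
    assert(average_run_length(src, 6) == 2);
    return 0;
}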
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+StringStats StringStats::generateStats(const cengine::StringArrayViewer src, const BITMAP *nullmap, u32 tuple_count, SIZE column_data_size)
+{
+ // -------------------------------------------------------------------------------------
+ // Collect stats
+ StringStats stats;
+ // -------------------------------------------------------------------------------------
+ stats.tuple_count = tuple_count;
+ stats.total_size = column_data_size;
+ stats.total_length = 0;
+ stats.total_unique_length = 0;
+ stats.null_count = 0;
+ // -------------------------------------------------------------------------------------
+ for ( u64 row_i = 0; row_i < tuple_count; row_i++ ) {
+ if ( nullmap == nullptr || nullmap[row_i] ) {
+ auto current_value = src(row_i);
+ if ( stats.distinct_values.find(current_value) == stats.distinct_values.end()) {
+ stats.distinct_values.insert(current_value);
+ stats.total_unique_length += current_value.length();
+ }
+ stats.total_length += current_value.size();
+ } else {
+ stats.distinct_values.insert("");
+ stats.null_count++;
+ continue;
+ }
+ }
+ // -------------------------------------------------------------------------------------
+ stats.unique_count = stats.distinct_values.size();
+ stats.set_count = stats.tuple_count - stats.null_count;
+ return stats;
+}
+// -------------------------------------------------------------------------------------
+}
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.hpp b/benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.hpp
new file mode 100644
index 0000000..1212e92
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/datablock/stats/StringStats.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include "Units.hpp"
+#include "storage/StringArrayViewer.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+#include <set>
+// -------------------------------------------------------------------------------------
+namespace cengine {
+namespace db {
+// -------------------------------------------------------------------------------------
+struct StringStats {
+ std::set<str> distinct_values;
+ // -------------------------------------------------------------------------------------
+ u32 total_size; // everything in the column, including the slots
+ u32 total_length; // only the strings, starting after the slots
+ u32 total_unique_length; // only the unique (dict) strings
+ u32 tuple_count;
+ // -------------------------------------------------------------------------------------
+ u32 null_count;
+ u32 unique_count;
+ u32 set_count;
+ // -------------------------------------------------------------------------------------
+ static StringStats generateStats(const StringArrayViewer src, const BITMAP *nullmap, u32 tuple_count, SIZE column_data_size);
+};
+// -------------------------------------------------------------------------------------
+}
+}
+// -------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/BZIP2.cpp b/benchmarks/analyze_better_blocks/cengine/extern/BZIP2.cpp
new file mode 100644
index 0000000..78e5823
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/BZIP2.cpp
@@ -0,0 +1,18 @@
+#include "BZIP2.hpp"
+// -------------------------------------------------------------------------------------
+#include "bzlib.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+u32 BZIP2::getCompressedSize(u8 *src, SIZE size)
+{
+ const SIZE bzip2_buffer_size = size * 2;
+ vector<char> bzip_buffer(bzip2_buffer_size);
+ unsigned int bzip_dest_len = bzip2_buffer_size;
+ auto ret_code = BZ2_bzBuffToBuffCompress(bzip_buffer.data(), &bzip_dest_len, reinterpret_cast<char *>(src), size, 9, 0, 30);
+ if ( ret_code != BZ_OK ) {
+ throw Generic_Exception("BZIP2 compression failed, error code = " + std::to_string(ret_code));
+ }
+ die_if(bzip_dest_len != 0);
+ return bzip_dest_len;
+}
+// -------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/BZIP2.hpp b/benchmarks/analyze_better_blocks/cengine/extern/BZIP2.hpp
new file mode 100644
index 0000000..9cea97d
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/BZIP2.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "Units.hpp"
+// -------------------------------------------------------------------------------------
+class BZIP2 {
+public:
+ static u32 getCompressedSize(u8 *src, SIZE size);
+};
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.cpp b/benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.cpp
new file mode 100644
index 0000000..c1c214f
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.cpp
@@ -0,0 +1,60 @@
+#include "FastPFOR.hpp"
+#include "SIMD.hpp"
+
+// fastpfor
+#include <fastpfor/fastpfor.h>
+#include <fastpfor/compositecodec.h>
+#include <fastpfor/variablebyte.h>
+#include <fastpfor/deltautil.h>
+
+#include "Exceptions.hpp"
+
+
+// -------------------------------------------------------------------------------------
+template<>
+struct LemiereImpl<FastPForCodec::FPF>::impl {
+ // using codec_t = BTR_IFELSESIMD(FastPForLib::SIMDFastPFor<8>, FastPForLib::FastPFor<8>);
+ using codec_t = FastPForLib::SIMDFastPFor<8>;
+ FastPForLib::CompositeCodec<codec_t, FastPForLib::VariableByte> codec;
+};
+// -------------------------------------------------------------------------------------
+template<>
+struct LemiereImpl<FastPForCodec::FBP>::impl {
+ // TODO: Adnan did not use SIMDBinaryPacking in the original? Ask him why.
+ FastPForLib::CompositeCodec<FastPForLib::SIMDBinaryPacking, FastPForLib::VariableByte> codec;
+};
+// -------------------------------------------------------------------------------------
+template<FastPForCodec Codec>
+LemiereImpl<Codec>::LemiereImpl() : pImpl(new LemiereImpl<Codec>::impl) {}
+// -------------------------------------------------------------------------------------
+template<FastPForCodec Codec>
+LemiereImpl<Codec>::~LemiereImpl() = default;
+// -------------------------------------------------------------------------------------
+template<FastPForCodec Codec>
+u32 LemiereImpl<Codec>::compress(const data_t* src, u32 count, data_t* dest, SIZE& outsize) {
+ auto& codec = this->pImpl->codec;
+ codec.encodeArray(src, count, dest, outsize);
+ return outsize;
+}
+// -------------------------------------------------------------------------------------
+template<FastPForCodec Codec>
+const typename LemiereImpl<Codec>::data_t* LemiereImpl<Codec>::decompress(const data_t* src, u32 count, data_t* dest, SIZE& outsize) {
+ auto& codec = this->pImpl->codec;
+ return codec.decodeArray(src, count, dest, outsize);
+}
+// -------------------------------------------------------------------------------------
+template<FastPForCodec Codec>
+void LemiereImpl<Codec>::applyDelta(data_t* src, size_t count) {
+ using namespace FastPForLib;
+ FastPForLib::Delta::deltaSIMD(src, count);
+}
+// -------------------------------------------------------------------------------------
+template<FastPForCodec Codec>
+void LemiereImpl<Codec>::revertDelta(data_t* src, size_t count) {
+ using namespace FastPForLib;
+ FastPForLib::Delta::inverseDeltaSIMD(src, count);
+}
+// -------------------------------------------------------------------------------------
+template struct LemiereImpl<FastPForCodec::FPF>;
+template struct LemiereImpl<FastPForCodec::FBP>;
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.hpp b/benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.hpp
new file mode 100644
index 0000000..dd586b7
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/FastPFOR.hpp
@@ -0,0 +1,35 @@
+#pragma once
+#include "Units.hpp"
+// -------------------------------------------------------------------------------------
+#include <cstddef>
+#include <memory>
+// -------------------------------------------------------------------------------------
+// The linker breaks when including the fastpfor library multiple times;
+// thus, provide a wrapper for the functions we use from it.
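A round-trip sketch of how the wrapper declared below is meant to be used. The 1024-word headroom on the buffers is an assumption for illustration, not something this header guarantees; u32 and SIZE are the aliases from Units.hpp.

#include <vector>

// Hypothetical round trip through the FPFor alias declared at the end
// of this header.
void fpfor_roundtrip_example(const std::vector<u32> &input) {
    FPFor codec;
    std::vector<u32> compressed(input.size() + 1024); // assumed worst-case headroom
    std::vector<u32> restored(input.size() + 1024);

    SIZE compressed_size = compressed.size();
    codec.compress(input.data(), input.size(), compressed.data(), compressed_size);

    SIZE restored_size = restored.size();
    codec.decompress(compressed.data(), compressed_size, restored.data(), restored_size);
    // restored_size now holds the number of decoded integers.
}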
+// -------------------------------------------------------------------------------------
+enum class FastPForCodec { FPF, FBP };
+// -------------------------------------------------------------------------------------
+template<FastPForCodec Codec>
+struct LemiereImpl {
+ using data_t = u32;
+ // -------------------------------------------------------------------------------------
+ LemiereImpl();
+ ~LemiereImpl();
+ // -------------------------------------------------------------------------------------
+ u32 compress(const data_t* src, u32 count, data_t* dest, SIZE& outsize);
+ const data_t* decompress(const data_t* src,
+ u32 count,
+ data_t* dest,
+ SIZE& outsize);
+ // -------------------------------------------------------------------------------------
+ static void applyDelta(data_t* src, size_t count);
+ static void revertDelta(data_t* src, size_t count);
+ // -------------------------------------------------------------------------------------
+ private:
+ struct impl;
+ std::unique_ptr<impl> pImpl;
+};
+// -------------------------------------------------------------------------------------
+using FPFor = LemiereImpl<FastPForCodec::FPF>;
+using FBPImpl = LemiereImpl<FastPForCodec::FBP>;
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/LZ4.cpp b/benchmarks/analyze_better_blocks/cengine/extern/LZ4.cpp
new file mode 100644
index 0000000..6f76846
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/LZ4.cpp
@@ -0,0 +1,16 @@
+#include "LZ4.hpp"
+// -------------------------------------------------------------------------------------
+#include "lz4.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+u32 LZ4::getCompressedSize(u8 *src, SIZE size)
+{
+ const SIZE LZ4_buffer_size = LZ4_compressBound(size);
+ vector<char> dest_buffer(LZ4_buffer_size);
+ auto after_size = LZ4_compress_default(reinterpret_cast<const char *>(src), dest_buffer.data(), size, LZ4_buffer_size);
+ if ( after_size == 0 ) {
+ throw Generic_Exception("LZ4 compression failed");
+ }
+ return after_size;
+}
+// -------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/LZ4.hpp b/benchmarks/analyze_better_blocks/cengine/extern/LZ4.hpp
new file mode 100644
index 0000000..591680b
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/LZ4.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "Units.hpp"
+// -------------------------------------------------------------------------------------
+class LZ4 {
+public:
+ static u32 getCompressedSize(u8 *src, SIZE size);
+};
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/XZ.cpp b/benchmarks/analyze_better_blocks/cengine/extern/XZ.cpp
new file mode 100644
index 0000000..c70e7d1
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/XZ.cpp
@@ -0,0 +1,9 @@
+#include "XZ.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+u32 XZ::getCompressedSize(u8 *src, SIZE size)
+{
+ // TODO: XZ support is not wired up yet; fail loudly instead of returning an indeterminate value.
+ throw Generic_Exception("XZ::getCompressedSize is not implemented");
+}
+// -------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/extern/XZ.hpp b/benchmarks/analyze_better_blocks/cengine/extern/XZ.hpp
new file mode 100644
index 0000000..5d4a29e
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/extern/XZ.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "Units.hpp"
+// -------------------------------------------------------------------------------------
+class XZ {
+public:
+ static u32 getCompressedSize(u8 *src, SIZE size);
+};
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/cengine/parser/CSVParser.hpp b/benchmarks/analyze_better_blocks/cengine/parser/CSVParser.hpp
new file mode 100644
index 0000000..e5c68af
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/parser/CSVParser.hpp
@@ -0,0 +1,375 @@
+// https://github.com/AriaFallah/csv-parser
+#ifndef ARIA_CSV_H
+#define ARIA_CSV_H
+
+#include <istream>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace aria {
+namespace csv {
+enum class Term : char {
+ CRLF = -2
+};
+enum class FieldType {
+ DATA,
+ ROW_END,
+ CSV_END
+};
+using CSV = std::vector<std::vector<std::string>>;
+
+// Checking for '\n', '\r', and '\r\n' by default
+inline bool operator==(const char c, const Term t)
+{
+ switch ( t ) {
+ case Term::CRLF:
+ return c == '\r' || c == '\n';
+ default:
+ return static_cast<char>(t) == c;
+ }
+}
+
+inline bool operator!=(const char c, const Term t)
+{
+ return !(c == t);
+}
+
+// Wraps returned fields so we can also indicate
+// that we hit row endings or the end of the csv itself
+struct Field {
+ explicit Field(FieldType t)
+ : type(t)
+ , data(nullptr) {}
+ explicit Field(const std::string &str)
+ : type(FieldType::DATA)
+ , data(&str) {}
+
+ FieldType type;
+ const std::string *data;
+};
+
+// Reads and parses lines from a csv file
+class CsvParser {
+private:
+ // CSV state for state machine
+ enum class State {
+ START_OF_FIELD,
+ IN_FIELD,
+ IN_QUOTED_FIELD,
+ IN_ESCAPED_QUOTE,
+ END_OF_ROW,
+ EMPTY
+ };
+ State m_state = State::START_OF_FIELD;
+
+ // Configurable attributes
+ char m_quote = '"';
+ char m_delimiter = ',';
+ Term m_terminator = Term::CRLF;
+ std::istream &m_input;
+
+ // Buffer capacities
+ static constexpr int FIELDBUF_CAP = 1024;
+ static constexpr int INPUTBUF_CAP = 1024 * 128;
+
+ // Buffers
+ std::string m_fieldbuf{};
+ char m_inputbuf[INPUTBUF_CAP]{};
+
+ // Misc
+ bool m_eof = false;
+ size_t m_cursor = INPUTBUF_CAP;
+ size_t m_inputbuf_size = INPUTBUF_CAP;
+ std::streamoff m_scanposition = -INPUTBUF_CAP;
+public:
+ // Creates the CSV parser which, by default, splits on commas,
+ // uses quotes to escape, and handles CSV files that end in either
+ // '\r', '\n', or '\r\n'.
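A minimal usage sketch based only on the interface in this header; the file name is illustrative. Iterating the parser yields one std::vector<std::string> per row, which is exactly how Parser.cpp later in this patch consumes it with a '|' delimiter.

#include <fstream>
#include <iostream>

// Parse a '|'-separated file row by row and print each field.
void print_csv_example() {
    std::ifstream f("data.csv");
    aria::csv::CsvParser parser = aria::csv::CsvParser(f)
        .delimiter('|')
        .terminator('\n');
    for (const auto &row : parser) {   // row is a std::vector<std::string>
        for (const auto &field : row)
            std::cout << field << ' ';
        std::cout << '\n';
    }
}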
+ explicit CsvParser(std::istream &input)
+ : m_input(input)
+ {
+ // Reserve space upfront to improve performance
+ m_fieldbuf.reserve(FIELDBUF_CAP);
+ if ( !m_input.good()) {
+ throw std::runtime_error("Something is wrong with input stream");
+ }
+ }
+
+ // Change the quote character
+ CsvParser &quote(char c) noexcept
+ {
+ m_quote = c;
+ return *this;
+ }
+
+ // Change the delimiter character
+ CsvParser &delimiter(char c) noexcept
+ {
+ m_delimiter = c;
+ return *this;
+ }
+
+ // Change the terminator character
+ CsvParser &terminator(char c) noexcept
+ {
+ m_terminator = static_cast<Term>(c);
+ return *this;
+ }
+
+ // The parser is in the empty state when there are
+ // no more tokens left to read from the input buffer
+ bool empty()
+ {
+ return m_state == State::EMPTY;
+ }
+
+ // Not the actual position in the stream (it's buffered), just the
+ // position up to the last available token
+ std::streamoff position() const
+ {
+ return m_scanposition + static_cast<std::streamoff>(m_cursor);
+ }
+
+ // Reads a single field from the CSV
+ Field next_field()
+ {
+ if ( empty()) {
+ return Field(FieldType::CSV_END);
+ }
+ m_fieldbuf.clear();
+
+ // This loop runs until either the parser has
+ // read a full field or until there's no tokens left to read
+ for ( ;; ) {
+ char *maybe_token = top_token();
+
+ // If we're out of tokens to read return whatever's left in the
+ // field and row buffers. If there's nothing left, return null.
+ if ( !maybe_token ) {
+ m_state = State::EMPTY;
+ return !m_fieldbuf.empty() ? Field(m_fieldbuf) : Field(FieldType::CSV_END);
+ }
+
+ // Parsing the CSV is done using a finite state machine
+ char c = *maybe_token;
+ switch ( m_state ) {
+ case State::START_OF_FIELD:
+ m_cursor++;
+ if ( c == m_terminator ) {
+ handle_crlf(c);
+ return Field(FieldType::ROW_END);
+ }
+
+ if ( c == m_quote && false ) { // forget about quoting, our csv input is not legal anyway
+ m_state = State::IN_QUOTED_FIELD;
+ } else if ( c == m_delimiter ) {
+ return Field(m_fieldbuf);
+ } else {
+ m_state = State::IN_FIELD;
+ m_fieldbuf += c;
+ }
+
+ break;
+
+ case State::IN_FIELD:
+ m_cursor++;
+ if ( c == m_terminator ) {
+ handle_crlf(c);
+ m_state = State::END_OF_ROW;
+ return Field(m_fieldbuf);
+ }
+
+ if ( c == m_delimiter ) {
+ m_state = State::START_OF_FIELD;
+ return Field(m_fieldbuf);
+ } else {
+ m_fieldbuf += c;
+ }
+
+ break;
+
+ case State::IN_QUOTED_FIELD:
+ m_cursor++;
+ if ( c == m_quote ) {
+ m_state = State::IN_ESCAPED_QUOTE;
+ } else {
+ m_fieldbuf += c;
+ }
+
+ break;
+
+ case State::IN_ESCAPED_QUOTE:
+ m_cursor++;
+ if ( c == m_terminator ) {
+ handle_crlf(c);
+ m_state = State::END_OF_ROW;
+ return Field(m_fieldbuf);
+ }
+
+ if ( c == m_quote ) {
+ m_state = State::IN_QUOTED_FIELD;
+ m_fieldbuf += c;
+ } else if ( c == m_delimiter ) {
+ m_state = State::START_OF_FIELD;
+ return Field(m_fieldbuf);
+ } else {
+ m_state = State::IN_FIELD;
+ m_fieldbuf += c;
+ }
+
+ break;
+
+ case State::END_OF_ROW:
+ m_state = State::START_OF_FIELD;
+ return Field(FieldType::ROW_END);
+
+ case State::EMPTY:
+ throw std::logic_error("You goofed");
+ }
+ }
+ }
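For reference, a sketch of driving next_field() directly instead of through the iterator, reconstructing rows from the DATA / ROW_END / CSV_END markers it returns. Purely illustrative; the iterator further down does the same thing while reusing its row buffer.

#include <string>
#include <vector>

// Collect all rows by looping on next_field().
std::vector<std::vector<std::string>> read_all(aria::csv::CsvParser &parser) {
    std::vector<std::vector<std::string>> rows(1);
    for (;;) {
        auto field = parser.next_field();
        if (field.type == aria::csv::FieldType::CSV_END) {
            if (rows.back().empty()) rows.pop_back(); // drop trailing empty row
            return rows;
        }
        if (field.type == aria::csv::FieldType::ROW_END)
            rows.emplace_back();
        else
            rows.back().push_back(*field.data);       // FieldType::DATA
    }
}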
+private:
+ // When the parser hits the end of a line it needs
+ // to check the special case of '\r\n' as a terminator.
+ // If it finds that the previous token was a '\r', and
+ // the next token will be a '\n', it skips the '\n'.
+ void handle_crlf(const char c)
+ {
+ if ( m_terminator != Term::CRLF || c != '\r' ) {
+ return;
+ }
+
+ char *token = top_token();
+ if ( token && *token == '\n' ) {
+ m_cursor++;
+ }
+ }
+
+ // Pulls the next token from the input buffer, but does not move
+ // the cursor forward. If the stream is empty and the input buffer
+ // is also empty return a nullptr.
+ char *top_token()
+ {
+ // Return null if there's nothing left to read
+ if ( m_eof && m_cursor == m_inputbuf_size ) {
+ return nullptr;
+ }
+
+ // Refill the input buffer if it's been fully read
+ if ( m_cursor == m_inputbuf_size ) {
+ m_scanposition += static_cast<std::streamoff>(m_cursor);
+ m_cursor = 0;
+ m_input.read(m_inputbuf, INPUTBUF_CAP);
+
+ // Indicate we hit end of file, and resize
+ // input buffer to show that it's not at full capacity
+ if ( m_input.eof()) {
+ m_eof = true;
+ m_inputbuf_size = m_input.gcount();
+
+ // Return null if there's nothing left to read
+ if ( m_inputbuf_size == 0 ) {
+ return nullptr;
+ }
+ }
+ }
+
+ return &m_inputbuf[m_cursor];
+ }
+public:
+ // Iterator implementation for the CSV parser, which reads
+ // from the CSV row by row in the form of a vector of strings
+ class iterator {
+ public:
+ using difference_type = std::ptrdiff_t;
+ using value_type = std::vector<std::string>;
+ using pointer = const std::vector<std::string> *;
+ using reference = const std::vector<std::string> &;
+ using iterator_category = std::input_iterator_tag;
+
+ explicit iterator(CsvParser *p, bool end = false)
+ : m_parser(p)
+ {
+ if ( !end ) {
+ m_row.reserve(50);
+ m_current_row = 0;
+ next();
+ }
+ }
+
+ iterator &operator++()
+ {
+ next();
+ return *this;
+ }
+
+ iterator operator++(int)
+ {
+ iterator i = (*this);
+ ++(*this);
+ return i;
+ }
+
+ bool operator==(const iterator &other) const
+ {
+ return m_current_row == other.m_current_row
+ && m_row.size() == other.m_row.size();
+ }
+
+ bool operator!=(const iterator &other) const
+ {
+ return !(*this == other);
+ }
+
+ reference operator*() const
+ {
+ return m_row;
+ }
+
+ pointer operator->() const
+ {
+ return &m_row;
+ }
+ private:
+ value_type m_row{};
+ CsvParser *m_parser;
+ int m_current_row = -1;
+
+ void next()
+ {
+ value_type::size_type num_fields = 0;
+ for ( ;; ) {
+ auto field = m_parser->next_field();
+ switch ( field.type ) {
+ case FieldType::CSV_END:
+ if ( num_fields < m_row.size()) {
+ m_row.resize(num_fields);
+ }
+ m_current_row = -1;
+ return;
+ case FieldType::ROW_END:
+ if ( num_fields < m_row.size()) {
+ m_row.resize(num_fields);
+ }
+ m_current_row++;
+ return;
+ case FieldType::DATA:
+ if ( num_fields < m_row.size()) {
+ m_row[num_fields] = std::move(*field.data);
+ } else {
+ m_row.push_back(std::move(*field.data));
+ }
+ num_fields++;
+ }
+ }
+ }
+ };
+
+ iterator begin() { return iterator(this); };
+ iterator end() { return iterator(this, true); };
+};
+}
+}
+#endif
diff --git a/benchmarks/analyze_better_blocks/cengine/parser/Parser.cpp b/benchmarks/analyze_better_blocks/cengine/parser/Parser.cpp
new file mode 100644
index 0000000..da50339
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/parser/Parser.cpp
@@ -0,0 +1,187 @@
+#include "Parser.hpp"
+#include "Exceptions.hpp"
+#include "Trim.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "CSVParser.hpp"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+DEFINE_string(csv_separator, "|", "CSV separator.");
separator."); +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +void Parser::parse(const string csv_path, const YAML::Node &schema, const string &out_dir) +{ + vector columns; + + // vector of vector for each type + vector> integer_vectors; + vector> double_vectors; + vector> string_vectors; + + if ( mkdir(out_dir.c_str(), S_IRWXU | S_IRWXG) && errno != EEXIST ) { + cerr << "creating output directory failed, status = " << errno << endl; + } + // read meta data and prepare columns + { + for ( const auto &column: schema["columns"] ) { + const string column_name = column["name"].as(); + const string column_type = column["type"].as(); + + cout << "column name = " << column_name << " , type = " << column_type << endl; + ColumnType type = ColumnType::UNDEFINED; + u32 vector_offset = 0; + + // ugly code + if ( column_type == "integer" || column_type == "smallint") { + type = ColumnType::INTEGER; + integer_vectors.push_back({}); + vector_offset = integer_vectors.size() - 1; + } else if ( column_type == "double" || column_type == "float") { + type = ColumnType::DOUBLE; + double_vectors.push_back({}); + vector_offset = double_vectors.size() - 1; + } else if ( column_type == "string" ) { + type = ColumnType::STRING; + string_vectors.push_back({}); + vector_offset = string_vectors.size() - 1; + } else { + type = ColumnType::SKIP; + } + columns.push_back({column_name, type, vector_offset, {}}); + } + } + // read csv data and fill the columns + { + std::ifstream csv_stream(csv_path.c_str()); + aria::csv::CsvParser parser = aria::csv::CsvParser(csv_stream) + .delimiter(*FLAGS_csv_separator.c_str()) + .terminator('\n'); + + u32 tuple_i = 0; + //u32 column_count = parser.begin()->size(); + for ( auto &tuple : parser ) { +// if(tuple.size() != column_count) { +// cerr << " lines " << tuple_i << " in csv is corrupted"<< endl; +// cerr << tuple[tuple.size() - 1] << endl; +// cerr << "will ignore this line" << endl; +// continue; +// } + tuple_i++; + u32 col_i = 0; + string column_debug_str; // for debugging + try { + for ( auto &column_csv_str: tuple ) { + auto &column_descriptor = columns[col_i++]; + if ( column_descriptor.column_type == ColumnType::SKIP ) // TODO: Reset + continue; + + string column_str; + if ( column_descriptor.column_type != ColumnType::STRING ) { + column_str = trim_copy(column_csv_str); + if ( column_str.size() != column_csv_str.size()) { + cout << "WARNING: trimmed col = '" << column_str << "' and untrimmed = '" << column_csv_str << "'" << endl; + } + } else { + column_str = column_csv_str; + } + column_debug_str = column_str; + switch ( column_descriptor.column_type ) { + case ColumnType::INTEGER: { + const bool is_set = (column_str.size() == 0 || column_str == "null") ? 0 : 1; + column_descriptor.set_bitmap.push_back(is_set); + // ------------------------------------------------------------------------------------- + const INTEGER value = (is_set ? std::stoi(column_str) : NULL_CODE); + die_if(is_set || value == NULL_CODE); + integer_vectors[column_descriptor.vector_offset].push_back(value); + // ------------------------------------------------------------------------------------- + // Update stats + column_descriptor.null_count += !is_set; + column_descriptor.empty_count += (value == 0) ? 1 : 0; + break; + } + case ColumnType::DOUBLE: { + const bool is_set = (column_str.size() == 0 || column_str == "null") ? 
0 : 1; + column_descriptor.set_bitmap.push_back(is_set); + // ------------------------------------------------------------------------------------- + DOUBLE value = (is_set ? std::stod(column_str) : NULL_CODE); + double_vectors[column_descriptor.vector_offset].push_back(value); + // ------------------------------------------------------------------------------------- + // Update stats + column_descriptor.null_count += !is_set; + column_descriptor.empty_count += (value == 0) ? 1 : 0; + break; + } + case ColumnType::STRING: { + const bool is_set = (column_str == "null") ? 0 : 1; + column_descriptor.set_bitmap.push_back(is_set); + // ------------------------------------------------------------------------------------- + string_vectors[column_descriptor.vector_offset].push_back(is_set ? column_str : ""); + // ------------------------------------------------------------------------------------- + // Update stats + column_descriptor.null_count += !is_set; + column_descriptor.empty_count += (column_str.size() == 0) ? 1 : 0; + break; + } + default: + UNREACHABLE(); + } + } + } catch ( std::exception e ) { + cout << "exception thrown during parsing tuple_i = " << tuple_i << " column col_i = " << col_i << " name =" << columns[col_i].name + << " type = " << ConvertTypeToString(columns[col_i].column_type) << + " with column = " << column_debug_str << endl; + cout << "DUMP TUPLE" << endl; + for ( u32 i = 0; i < tuple.size(); i++ ) { + cout << i << " : " << tuple[i] << endl; + } + cout << e.what() << endl; + exit(1); + } + } + } + // write data to binary + { + for ( u32 col_i = 0; col_i < columns.size(); col_i++ ) { + auto &column_descriptor = columns[col_i]; + if ( column_descriptor.column_type == ColumnType::SKIP ) + continue; + string output_column_file = out_dir + std::to_string(col_i + 1) + "_" + column_descriptor.name; + // ------------------------------------------------------------------------------------- + // Write Bitmap + const string output_column_bitmap_file = output_column_file + ".bitmap"; + writeBinary(output_column_bitmap_file.c_str(), column_descriptor.set_bitmap); + // ------------------------------------------------------------------------------------- + switch ( column_descriptor.column_type ) { + case ColumnType::INTEGER: { + output_column_file += ".integer"; + die_if(integer_vectors[column_descriptor.vector_offset].size() == column_descriptor.set_bitmap.size()); + writeBinary(output_column_file.c_str(), integer_vectors[column_descriptor.vector_offset]); + break; + } + case ColumnType::DOUBLE: { + output_column_file += ".double"; + writeBinary(output_column_file.c_str(), double_vectors[column_descriptor.vector_offset]); + break; + } + case ColumnType::STRING: { + output_column_file += ".string"; + writeBinary(output_column_file.c_str(), string_vectors[column_descriptor.vector_offset]); + break; + } + default: + UNREACHABLE(); + } + // ------------------------------------------------------------------------------------- + // Write Stats + output_column_file += ".stats"; + std::ofstream stats_file(output_column_file); + stats_file << "null_count|empty_count\n"; + stats_file << column_descriptor.null_count << "|" << column_descriptor.empty_count << endl; + stats_file.close(); + } + } +} +// ------------------------------------------------------------------------------------- +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/parser/Parser.hpp 
new file mode 100644
index 0000000..70043be
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/parser/Parser.hpp
@@ -0,0 +1,25 @@
+#pragma once
+#include "Units.hpp"
+#include "MMapvector.hpp"
+#include "Exceptions.hpp"
+// -------------------------------------------------------------------------------------
+#include "yaml-cpp/yaml.h"
+// -------------------------------------------------------------------------------------
+// -------------------------------------------------------------------------------------
+namespace cengine {
+// -------------------------------------------------------------------------------------
+class Parser {
+public:
+ struct ColumnDescriptor {
+ string name;
+ ColumnType column_type;
+ u32 vector_offset;
+ vector<BITMAP> set_bitmap;
+ u32 null_count = 0; // incremented when 'null' appears in the input
+ u32 empty_count = 0; // 0 for doubles and integers, '' for strings
+ };
+ static void parse(const string csv_path, const YAML::Node &, const string &);
+};
+// -------------------------------------------------------------------------------------
+}
+
diff --git a/benchmarks/analyze_better_blocks/cengine/parser/Trim.hpp b/benchmarks/analyze_better_blocks/cengine/parser/Trim.hpp
new file mode 100644
index 0000000..52fd110
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/parser/Trim.hpp
@@ -0,0 +1,48 @@
+#pragma once
+#include <algorithm>
+#include <cctype>
+#include <locale>
+
+// trim from start (in place)
+static inline void ltrim(std::string &s)
+{
+ s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
+ return !std::isspace(ch);
+ }));
+}
+
+// trim from end (in place)
+static inline void rtrim(std::string &s)
+{
+ s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
+ return !std::isspace(ch);
+ }).base(), s.end());
+}
+
+// trim from both ends (in place)
+static inline void trim(std::string &s)
+{
+ ltrim(s);
+ rtrim(s);
+}
+
+// trim from start (copying)
+static inline std::string ltrim_copy(std::string s)
+{
+ ltrim(s);
+ return s;
+}
+
+// trim from end (copying)
+static inline std::string rtrim_copy(std::string s)
+{
+ rtrim(s);
+ return s;
+}
+
+// trim from both ends (copying)
+static inline std::string trim_copy(std::string s)
+{
+ trim(s);
+ return s;
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/cengine/storage/Chunk.cpp b/benchmarks/analyze_better_blocks/cengine/storage/Chunk.cpp
new file mode 100644
index 0000000..3cd5094
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/cengine/storage/Chunk.cpp
@@ -0,0 +1,274 @@
+#include "Chunk.hpp"
+#include "Exceptions.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+#include "utils/Utils.hpp"
+// -------------------------------------------------------------------------------------
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+// -------------------------------------------------------------------------------------
+DEFINE_uint32(part_size_threshold, 16 * 1024 * 1024, "");
+namespace cengine {
+// -------------------------------------------------------------------------------------
+ void Chunk::reset() {
+ for (u32 col_i = 0; col_i < relation.columns.size(); col_i++) {
+ columns[col_i].reset();
+ }
+ sizes.reset();
+ columns.reset();
+ }
+
+// -------------------------------------------------------------------------------------
+ bool Chunk::operator==(const cengine::Chunk &other) const {
+ // offset_from_relation is not relevant here
+ if (tuple_count !=
other.tuple_count) + return false; + if (relation.columns.size() != other.relation.columns.size()) + return false; + + for (u32 column_i = 0; column_i < relation.columns.size(); column_i++) { + if (!column_requires_copy[column_i] && sizes[column_i] != other.sizes[column_i]) { + cerr << "== : sizes in column " << column_i << " are not identical: " << sizes[column_i] << " vs. " + << other.sizes[column_i] << endl; + return false; + } + + if ((bitmaps[column_i] && other.bitmaps[column_i]) && + (std::memcmp(bitmaps[column_i].get(), other.bitmaps[column_i].get(), sizeof(BITMAP) * tuple_count) != + 0)) { + cerr << "== : bitmaps in column " << column_i << " are not identical" << endl; + return false; + } + + + for (u32 row_i = 0; row_i < tuple_count; row_i++) { + if (bitmaps[column_i][row_i]) { + switch (relation.columns[column_i].type) { + case ColumnType::INTEGER: { + auto me = reinterpret_cast(columns[column_i].get())[row_i]; + auto they = reinterpret_cast(other.columns[column_i].get())[row_i]; + if (me != they) { + cerr << "== : INTEGER column (" << relation.columns[column_i].name + << ") data are not identical\t" << "row_i = " << row_i << endl << me << endl + << they << endl; + return false; + } + break; + } + case ColumnType::DOUBLE: { + auto me = reinterpret_cast(columns[column_i].get())[row_i]; + auto they = reinterpret_cast(other.columns[column_i].get())[row_i]; + if (me != they) { + cerr << std::setprecision(30) << endl; + cerr << "== : DOUBLE column (" << relation.columns[column_i].name + << ") data are not identical\t" << "row_i = " << row_i << endl << me << endl + << they << endl; + return false; + } + break; + } + case ColumnType::STRING: { + auto me = this->operator()(column_i, row_i); + auto they = other.operator()(column_i, row_i); + if (me.length() != they.length() || std::memcmp(me.data(), they.data(), me.length()) != 0) { + cerr << "== : STRING column (" << relation.columns[column_i].name + << ") data are not identical\t" << "row_i = " << row_i << endl; + cerr << "me_size = \t" << me.length() << " - " << me << endl; + cerr << "they_size = \t" << they.length() << " - " << they << endl; + return false; + } + break; + } + } + } + } + + } + return true; + } + +// ------------------------------------------------------------------------------------- + Chunk::Chunk(unique_ptr[]> &&columns, unique_ptr[]> &&bitmaps, + unique_ptr &&column_requires_copy, u64 tuple_count, const Relation &relation, + unique_ptr &&sizes) + : relation(relation), columns(std::move(columns)), bitmaps(std::move(bitmaps)), + column_requires_copy(std::move(column_requires_copy)), sizes(std::move(sizes)), + tuple_count(tuple_count) {} + + Chunk::Chunk(unique_ptr[]> &&columns, unique_ptr[]> &&bitmaps, + u64 tuple_count, const Relation &relation, unique_ptr &&sizes) + : relation(relation), columns(std::move(columns)), bitmaps(std::move(bitmaps)), sizes(std::move(sizes)), + tuple_count(tuple_count) { + this->column_requires_copy = std::unique_ptr(new bool[tuple_count]); + for (u64 idx = 0; idx < tuple_count; idx++) { + column_requires_copy[idx] = false; + } + } + +// ------------------------------------------------------------------------------------- + InputChunk::InputChunk(unique_ptr &&data, unique_ptr &&bitmap, ColumnType type, u64 tuple_count, + SIZE size) + : data(std::move(data)), + nullmap(std::move(bitmap)), + type(type), + size(size), + tuple_count(tuple_count) {} + + bool InputChunk::compareContents(u8 *their_data, BitmapWrapper *bitmapWrapper, u64 their_tuple_count, + bool requires_copy) const { + if 
(their_tuple_count != this->tuple_count) { + std::cerr << "Tuple count is not equal. Expected: " << this->tuple_count << ". Got: " << their_tuple_count << std::endl; + return false; + } + + std::vector their_bitmap(their_tuple_count); + bitmapWrapper->writeBITMAP(their_bitmap.data()); + for (u64 idx = 0; idx < their_tuple_count; idx++) { + if (this->nullmap[idx] != their_bitmap[idx]) { + std::cerr << "Bitmaps are not equal at index " << idx << ". Expected: " << this->nullmap[idx] + << " Got: " << their_bitmap[idx] << std::endl; + return false; + } + } + + switch (this->type) { + case ColumnType::INTEGER: { + if (requires_copy) { + throw Generic_Exception("requires_copy not implemented for type INTEGER"); + } + + auto their_ints = reinterpret_cast(their_data); + auto my_ints = reinterpret_cast(this->data.get()); + for (u64 idx = 0; idx < their_tuple_count; idx++) { + if (this->nullmap[idx] && my_ints[idx] != their_ints[idx]) { + std::cerr << "Integer data is not equal at index " << idx << " Expected: " << my_ints[idx] + << " Got: " << their_ints[idx] << std::endl; + return false; + } + } + break; + } + case ColumnType::DOUBLE: { + if (requires_copy) { + throw Generic_Exception("requires_copy not implemented for type DOUBLE"); + } + + auto their_doubles = reinterpret_cast(their_data); + auto my_doubles = reinterpret_cast(this->data.get()); + for (u64 idx = 0; idx < their_tuple_count; idx++) { + if (this->nullmap[idx] && my_doubles[idx] != their_doubles[idx]) { + std::cerr << "Double data is not equal at index " << idx << std::setprecision(1000) + << " Expected: " << my_doubles[idx] + << " Got: " << their_doubles[idx] << std::endl; + return false; + } + } + break; + } + case ColumnType::STRING: { + auto my_view = cengine::StringArrayViewer(this->data.get()); + for (u64 idx = 0; idx < their_tuple_count; idx++) { + if (!this->nullmap[idx]) { + continue; + } + + str my_str = my_view(idx); + str their_str; + if (requires_copy) { + auto their_view = cengine::StringPointerArrayViewer(their_data); + their_str = their_view(idx); + } else { + auto their_view = cengine::StringArrayViewer(their_data); + their_str = their_view(idx); + } + + if (my_str.length() != their_str.length()) { + std::cerr << "String lengths are not equal at index " << idx << " Expected: " << my_str << " " + << my_str.length() << " Got: " << their_str << " " << their_str.length() << std::endl; + return false; + } + + if (my_str != their_str) { + std::cerr << "Strings are not equal at index " << idx << " Expected: " << my_str << " Got: " + << their_str << std::endl; + return false; + } + } + break; + } + default: + throw Generic_Exception("Type not implemented"); + } + return true; + } + + bool ColumnPart::canAdd(SIZE chunk_size) { + if (chunk_size > FLAGS_part_size_threshold) { + // This may appear in practice, but we ignore the problem for now. + // Although writing will work, reading the data back in may break, as we assume objects to always have a + // maximum size of part_size_threshold. 
+ throw std::logic_error("chunks with compressed size greater than part_size_threshold unsupported"); + } + return (total_size + chunk_size) <= FLAGS_part_size_threshold; + } + + void ColumnPart::addCompressedChunk(vector &&chunk) { + total_size += chunk.size(); + chunks.push_back(chunk); + } + + u32 ColumnPart::writeToDisk(const std::string &outputfile) { + std::ofstream btr_file(outputfile, std::ios::out | std::ios::binary); + if (!btr_file.good()) { + perror(outputfile.c_str()); + throw Generic_Exception("Opening btr output file failed"); + } + + struct ColumnPartMetadata metadata{ + .num_chunks = static_cast(this->chunks.size()) + }; + + // We need to align the offsets by 16. Otherwise, PBP decompression breaks. + u32 current_offset = sizeof(metadata) + this->chunks.size() * sizeof(u32); + u64 diff; + current_offset = Utils::alignBy(current_offset, 16, diff); + std::vector offsets; + std::vector diffs; + for (const auto &chunk: this->chunks) { + offsets.push_back(current_offset); + diffs.push_back(diff); + current_offset += chunk.size(); + current_offset = Utils::alignBy(current_offset, 16, diff); + } + + // Write metadata + btr_file.write(reinterpret_cast(&metadata), sizeof(metadata)); + // Write offsets + btr_file.write(reinterpret_cast(offsets.data()), offsets.size() * sizeof(u32)); + // Write chunks + for (std::size_t chunk_i = 0; chunk_i < chunks.size(); chunk_i++) { + auto &chunk = this->chunks[chunk_i]; + btr_file.seekp(diffs[chunk_i], std::ios::cur); + btr_file.write(reinterpret_cast(chunk.data()), chunk.size()); + } + + u32 bytes_written = this->total_size + sizeof(metadata) + offsets.size() * sizeof(u32); + btr_file.flush(); + btr_file.close(); + if (btr_file.fail()) { + perror(outputfile.c_str()); + throw Generic_Exception("Closing btr file failed"); + } + this->reset(); + + return bytes_written; + } + + void ColumnPart::reset() { + this->total_size = 0; + this->chunks.clear(); + } +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/storage/Chunk.hpp b/benchmarks/analyze_better_blocks/cengine/storage/Chunk.hpp new file mode 100644 index 0000000..1b1dafa --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/storage/Chunk.hpp @@ -0,0 +1,106 @@ +#pragma once +#include "Units.hpp" +#include "Column.hpp" +#include "Relation.hpp" +#include "StringArrayViewer.hpp" +#include "StringPointerArrayViewer.hpp" +#include "datablock/schemes/v2/bitmap/RoaringBitmap.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +#include +// ------------------------------------------------------------------------------------- +namespace cengine { +class Relation; +// ------------------------------------------------------------------------------------- +class Chunk { +public: + const Relation &relation; + unique_ptr[]> columns; + unique_ptr[]> bitmaps; + unique_ptr column_requires_copy; + unique_ptr sizes; + const u64 tuple_count; + + Chunk(unique_ptr[]> &&columns, unique_ptr[]> &&bitmaps, + unique_ptr &&column_requires_copy, u64 tuple_count, const Relation &relation, + unique_ptr &&sizes); + + Chunk(unique_ptr[]> &&columns, unique_ptr[]> &&bitmaps, + u64 tuple_count, const Relation &relation, unique_ptr &&sizes); + + template + inline T &get(u32 c, u32 i) const + { + return reinterpret_cast(columns[c].get())[i]; + } + + inline BITMAP has(u32 c, u32 i) const + { + return 
bitmaps.get()[c].get()[i]; + } + + inline const str operator()(u32 c, u32 i) const + { + if (column_requires_copy[c]) { + StringPointerArrayViewer viewer(columns[c].get()); + return viewer(i); + } else { + return StringArrayViewer::get(columns[c].get(), i); + } + } + + template + inline const T *array(u32 c) const + { + return reinterpret_cast(columns[c].get()); + } + + inline const BITMAP *nullmap(u32 c) const + { + return bitmaps.get()[c].get(); + } + + inline SIZE size(u32 c) const { return sizes[c]; } + + bool operator==(const Chunk &other) const; + + void reset(); +}; + +class InputChunk { + /* + * Chunk of a single column before compression + */ +public: + unique_ptr data; + unique_ptr nullmap; + ColumnType type; + SIZE size; + const u64 tuple_count; + + InputChunk(unique_ptr &&data, unique_ptr &&bitmap, ColumnType type, u64 tuple_count, SIZE size); + + bool compareContents(u8 *their_data, BitmapWrapper *bitmapWrapper, u64 their_tuple_count, bool requires_copy) const; +}; + +struct ColumnPartMetadata { + u32 num_chunks; + u32 offsets[]; +}; + +class ColumnPart { + /* + * Multiple compressed chunks of a single column, so they have at minimum the target file size + */ +public: + SIZE total_size = 0; + vector> chunks; + + [[nodiscard]] bool canAdd(SIZE chunk_size); + void addCompressedChunk(vector &&chunk); + u32 writeToDisk(const std::string& outputfile); + void reset(); +}; +// ------------------------------------------------------------------------------------- +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/storage/Column.cpp b/benchmarks/analyze_better_blocks/cengine/storage/Column.cpp new file mode 100644 index 0000000..f865126 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/storage/Column.cpp @@ -0,0 +1,66 @@ +#include "Column.hpp" +#include "Exceptions.hpp" +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +Column::Column(const ColumnType type, const string name, const string data_path, const string bitmap_path) + : type(type) + , name(name) +{ + switch ( type ) { + case ColumnType::INTEGER: + data.emplace<0>(data_path.c_str()); + break; + case ColumnType::DOUBLE: + data.emplace<1>(data_path.c_str()); + break; + case ColumnType::STRING: + data.emplace<2>(data_path.c_str()); + break; + default: + UNREACHABLE(); + break; + } + bitmap.readBinary(bitmap_path.c_str()); +} +// ------------------------------------------------------------------------------------- +const Vector &Column::integers() const +{ + return std::get<0>(data); +} +// ------------------------------------------------------------------------------------- +const Vector &Column::doubles() const +{ + return std::get<1>(data); +} +// ------------------------------------------------------------------------------------- +const Vector &Column::strings() const +{ + return std::get<2>(data); +} +// ------------------------------------------------------------------------------------- +const Vector &Column::bitmaps() const +{ + return bitmap; +} +// ------------------------------------------------------------------------------------- +SIZE Column::sizeInBytes() const +{ + switch ( type ) { + case ColumnType::INTEGER: + return integers().size() * sizeof(INTEGER); + break; + case ColumnType::DOUBLE: + return doubles().size() * sizeof(DOUBLE); + break; + case 
ColumnType::STRING: + return strings().fileSize; + break; + default: + UNREACHABLE(); + break; + } +} +} +// ------------------------------------------------------------------------------------- + diff --git a/benchmarks/analyze_better_blocks/cengine/storage/Column.hpp b/benchmarks/analyze_better_blocks/cengine/storage/Column.hpp new file mode 100644 index 0000000..da35782 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/storage/Column.hpp @@ -0,0 +1,26 @@ +#pragma once +#include "Units.hpp" +#include "MMapvector.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +#include +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +class Column { +public: + const ColumnType type; + const string name; + std::variant, Vector, Vector> data; + Vector bitmap; + + Column(const ColumnType type, const string name, const string data_path, const string bitmap_path); + const Vector &integers() const; + const Vector &doubles() const; + const Vector &strings() const; + const Vector &bitmaps() const; + SIZE sizeInBytes() const; +}; +// ------------------------------------------------------------------------------------- +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/storage/Relation.cpp b/benchmarks/analyze_better_blocks/cengine/storage/Relation.cpp new file mode 100644 index 0000000..04b6fbe --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/storage/Relation.cpp @@ -0,0 +1,224 @@ +#include "Relation.hpp" +#include "Exceptions.hpp" +#include "StringArrayViewer.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "tbb/parallel_for.h" +// ------------------------------------------------------------------------------------- +#include +#include +// ------------------------------------------------------------------------------------- +DEFINE_uint64(block_size, 65536, "Block(Chunk) size"); +DEFINE_string(only_type, "", ""); +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +Relation::Relation() +{ + columns.reserve(100); // Dirty fix: somehow columns Vector get destroyed and never correctly restored during vector resize +} +// ------------------------------------------------------------------------------------- +Relation::Relation(const YAML::Node &schema, const string &columns_dir) +{ + columns.reserve(schema["columns"].size()); + const auto &columns = schema["columns"]; + for ( u32 column_i = 0; column_i < columns.size(); column_i++ ) { + const auto &column = columns[column_i]; + const string column_name = column["name"].as(); + string column_type = column["type"].as(); + if (column_type == "smallint") { + column_type = "integer"; + } else if (column_type == "float") { + column_type = "double"; + } + // ------------------------------------------------------------------------------------- + if ( FLAGS_only_type != "" ) { + if ( column_type != FLAGS_only_type ) { + continue; + } + } + // ------------------------------------------------------------------------------------- + const string 
column_file_prefix = columns_dir + std::to_string(column_i + 1) + "_" + column_name; + const string column_file_path = column_file_prefix + "." + column_type; + if ( column_type == "integer" || column_type == "double" || column_type == "string" ) { + addColumn(column_file_path); + } + } +} +// ------------------------------------------------------------------------------------- +void Relation::addColumn(const string column_file_path) +{ + std::string column_name, column_type_str; + std::regex re(".*\\/(.*)\\.(\\w*)"); + std::smatch match; + if ( std::regex_search(column_file_path, match, re) && match.size() > 1 ) { + column_name = match.str(1); + column_type_str = match.str(2); + ColumnType column_type = ConvertStringToType(column_type_str); + + string bitmap_file_path = column_file_path; + bitmap_file_path = bitmap_file_path.replace(bitmap_file_path.end() - column_type_str.length(), bitmap_file_path.end(), "bitmap"); + columns.emplace_back(column_type, column_name, column_file_path, bitmap_file_path); + } else { + UNREACHABLE(); + } + fixTupleCount(); +} +// ------------------------------------------------------------------------------------- +// This could as well just return vector +vector> Relation::getRanges(cengine::SplitStrategy strategy, u32 max_chunk_count) const +{ + // ------------------------------------------------------------------------------------- + // Build all possible ranges + vector> ranges; // (start_index, length) + for ( u64 offset = 0; offset < tuple_count; offset += FLAGS_block_size ) { + // ------------------------------------------------------------------------------------- + u64 chunk_tuple_count; + if ( offset + FLAGS_block_size >= tuple_count ) { + chunk_tuple_count = tuple_count - offset; + } else { + chunk_tuple_count = FLAGS_block_size; + } + ranges.emplace_back(offset, chunk_tuple_count); + } + // ------------------------------------------------------------------------------------- + if ( strategy == SplitStrategy::RANDOM ) { + std::random_shuffle(ranges.begin(), ranges.end()); + cout << std::get<0>(ranges[0])<< endl; + } + // ------------------------------------------------------------------------------------- + if ( max_chunk_count ) { + ranges.resize(std::min(SIZE(max_chunk_count), ranges.size())); + } + return ranges; +} +// ------------------------------------------------------------------------------------- +Chunk Relation::getChunk(vector &ranges, SIZE chunk_i) const +{ + auto const &range = ranges[chunk_i]; + auto offset = std::get<0>(range); + auto chunk_tuple_count = std::get<1>(range); + // ------------------------------------------------------------------------------------- + auto c_columns = std::unique_ptr[]>(new std::unique_ptr[columns.size()]); + auto c_bitmaps = std::unique_ptr[]>(new std::unique_ptr[columns.size()]); + auto c_sizes = std::unique_ptr(new SIZE[columns.size()]); + // ------------------------------------------------------------------------------------- + for ( u32 i = 0; i < columns.size(); i++ ) { + // ------------------------------------------------------------------------------------- + c_bitmaps[i] = std::unique_ptr(new BITMAP[chunk_tuple_count * sizeof(BITMAP)]); + std::memcpy(reinterpret_cast(c_bitmaps[i].get()), columns[i].bitmaps().data + offset, chunk_tuple_count * sizeof(BITMAP)); + // ------------------------------------------------------------------------------------- + switch ( columns[i].type ) { + case ColumnType::INTEGER: { + c_sizes[i] = chunk_tuple_count * sizeof(INTEGER); + c_columns[i] = 
std::unique_ptr(new u8[c_sizes[i]]); + std::memcpy(reinterpret_cast(c_columns[i].get()), columns[i].integers().data + offset, chunk_tuple_count * sizeof(INTEGER)); + break; + } + case ColumnType::DOUBLE: { + c_sizes[i] = chunk_tuple_count * sizeof(DOUBLE); + c_columns[i] = std::unique_ptr(new u8[c_sizes[i]]); + std::memcpy(reinterpret_cast(c_columns[i].get()), columns[i].doubles().data + offset, chunk_tuple_count * sizeof(DOUBLE)); + break; + } + case ColumnType::STRING: { + const u64 slots_size = sizeof(StringArrayViewer::Slot) * (chunk_tuple_count + 1); + // ------------------------------------------------------------------------------------- + const StringIndexSlot *source_slots = columns[i].strings().data->slot; + + const u64 strings_size = (source_slots[offset + chunk_tuple_count - 1].offset - source_slots[offset].offset) + + source_slots[offset + chunk_tuple_count - 1].size; + // ------------------------------------------------------------------------------------- + c_sizes[i] = slots_size + strings_size; + c_columns[i] = std::unique_ptr(new u8[c_sizes[i]]); + // ------------------------------------------------------------------------------------- + auto dest_slots = reinterpret_cast(c_columns[i].get()); + // ------------------------------------------------------------------------------------- + const u64 bias = source_slots[offset].offset - slots_size; + for ( u64 slot_index = 0; slot_index < chunk_tuple_count; slot_index++ ) { + dest_slots[slot_index].offset = source_slots[slot_index + offset].offset - bias; + } + dest_slots[chunk_tuple_count].offset = c_sizes[i]; + // ------------------------------------------------------------------------------------- + // copy the strings + std::memcpy(reinterpret_cast(c_columns[i].get() + slots_size), columns[i].strings()[offset].data(), strings_size); + // ------------------------------------------------------------------------------------- + break; + } + default: + UNREACHABLE(); + break; + } + } + // ------------------------------------------------------------------------------------- + return Chunk(std::move(c_columns), std::move(c_bitmaps), chunk_tuple_count, *this, + std::move(c_sizes)); +} +// ------------------------------------------------------------------------------------- +InputChunk Relation::getInputChunk(Range &range, SIZE chunk_i, u32 column) const { + auto offset = std::get<0>(range); + auto chunk_tuple_count = std::get<1>(range); + + auto bitmap = std::unique_ptr(new BITMAP[chunk_tuple_count * sizeof(BITMAP)]); + std::memcpy(reinterpret_cast(bitmap.get()), columns[column].bitmaps().data + offset, chunk_tuple_count * sizeof(BITMAP)); + + SIZE size; + std::unique_ptr data; + switch ( columns[column].type ) { + case ColumnType::INTEGER: { + size = chunk_tuple_count * sizeof(INTEGER); + data = std::unique_ptr(new u8[size]); + std::memcpy(reinterpret_cast(data.get()), columns[column].integers().data + offset, chunk_tuple_count * sizeof(INTEGER)); + break; + } + case ColumnType::DOUBLE: { + size = chunk_tuple_count * sizeof(DOUBLE); + data = std::unique_ptr(new u8[size]); + std::memcpy(reinterpret_cast(data.get()), columns[column].doubles().data + offset, chunk_tuple_count * sizeof(DOUBLE)); + break; + } + case ColumnType::STRING: { + const u64 slots_size = sizeof(StringArrayViewer::Slot) * (chunk_tuple_count + 1); + // ------------------------------------------------------------------------------------- + const StringIndexSlot *source_slots = columns[column].strings().data->slot; + + const u64 strings_size = (source_slots[offset + 
chunk_tuple_count - 1].offset - source_slots[offset].offset) + + source_slots[offset + chunk_tuple_count - 1].size; + // ------------------------------------------------------------------------------------- + size = slots_size + strings_size; + data = std::unique_ptr(new u8[size]); + // ------------------------------------------------------------------------------------- + auto dest_slots = reinterpret_cast(data.get()); + // ------------------------------------------------------------------------------------- + const u64 bias = source_slots[offset].offset - slots_size; + for ( u64 slot_index = 0; slot_index < chunk_tuple_count; slot_index++ ) { + dest_slots[slot_index].offset = source_slots[slot_index + offset].offset - bias; + } + dest_slots[chunk_tuple_count].offset = size; + // ------------------------------------------------------------------------------------- + // copy the strings + std::memcpy(reinterpret_cast(data.get() + slots_size), columns[column].strings()[offset].data(), strings_size); + // ------------------------------------------------------------------------------------- + break; + } + default: + throw Generic_Exception("Type not implemented"); + } + + return {std::move(data), std::move(bitmap), columns[column].type, chunk_tuple_count, size}; +} +// ------------------------------------------------------------------------------------- +void Relation::fixTupleCount() +{ + if ( columns.size()) { + tuple_count = columns[0].bitmap.size(); + for ( auto &column : columns ) { + die_if(tuple_count == column.bitmap.size()); + } + } else { + tuple_count = 0; + } +} +// ------------------------------------------------------------------------------------- +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/storage/Relation.hpp b/benchmarks/analyze_better_blocks/cengine/storage/Relation.hpp new file mode 100644 index 0000000..1071453 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/storage/Relation.hpp @@ -0,0 +1,38 @@ +#pragma once +#include "Units.hpp" +#include "Column.hpp" +#include "Chunk.hpp" +// ------------------------------------------------------------------------------------- +#include "yaml-cpp/yaml.h" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +enum class SplitStrategy : u8 { + SEQUENTIAL, + RANDOM +}; +// ------------------------------------------------------------------------------------- +class Chunk; +class InputChunk; +// ------------------------------------------------------------------------------------- +using Range = tuple; +// ------------------------------------------------------------------------------------- +class Relation { +public: + string name; + u64 tuple_count; + vector columns; + void collectStatistics(); //TODO + Relation(const YAML::Node &schema, const string &columns_dir); + Relation(); + vector getRanges(cengine::SplitStrategy strategy, u32 max_chunk_count) const; + Chunk getChunk(vector &ranges, SIZE chunk_i) const; + InputChunk getInputChunk(Range &range, SIZE chunk_i, u32 column) const; + void addColumn(const string column_file_path); +private: + void fixTupleCount(); +}; +// ------------------------------------------------------------------------------------- +} +// 
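+ // The STRING cases in getChunk/getInputChunk above rebase absolute slot
+ // offsets into chunk-local ones. A worked sketch (values illustrative): for a
+ // 10-tuple chunk, slots_size = sizeof(Slot) * (10 + 1) = 44 bytes; if
+ // source_slots[offset].offset == 1000, then bias = 1000 - 44 = 956 and the
+ // first rebased offset is 1000 - 956 = 44, i.e. the first string begins
+ // immediately after the local slot array:
+ //   dest_slots[i].offset = source_slots[i + offset].offset - bias;
+ //   dest_slots[n].offset = size; // the extra slot terminates string n-1
+ // This preserves the viewer invariant that string i occupies
+ // [slots[i].offset, slots[i+1].offset).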
------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cengine/storage/StringArrayViewer.hpp b/benchmarks/analyze_better_blocks/cengine/storage/StringArrayViewer.hpp new file mode 100644 index 0000000..8c6e747 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/storage/StringArrayViewer.hpp @@ -0,0 +1,44 @@ +#pragma once +#include "Units.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +// just a view +struct StringArrayViewer { + struct Slot { + INTEGER offset; + }; + const u8 *slots_ptr; + + StringArrayViewer(const u8 *slots_ptr) + : slots_ptr(slots_ptr) {} + + inline static const str get(const u8 *slots_ptr, u32 i) + { + auto slots = reinterpret_cast(slots_ptr); + return str(reinterpret_cast(reinterpret_cast(slots_ptr) + slots[i].offset), slots[i + 1].offset - slots[i].offset); + } + + inline u32 size(u32 i) const + { + auto slots = reinterpret_cast(slots_ptr); + return slots[i + 1].offset - slots[i].offset; + } + + inline const str operator()(u32 i) const + { + auto slots = reinterpret_cast(slots_ptr); + const u32 str_length = slots[i + 1].offset - slots[i].offset; + return str(reinterpret_cast(slots_ptr + slots[i].offset), str_length); + } + + inline const char *get_pointer(u32 i) const + { + auto slots = reinterpret_cast(slots_ptr); + return reinterpret_cast(slots_ptr + slots[i].offset); + } +}; +// ------------------------------------------------------------------------------------- +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/cengine/storage/StringPointerArrayViewer.hpp b/benchmarks/analyze_better_blocks/cengine/storage/StringPointerArrayViewer.hpp new file mode 100644 index 0000000..a5d8c18 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/storage/StringPointerArrayViewer.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "Units.hpp" + +/* + * This supports a minimal interface. 
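+ * (For reference, a hedged usage sketch of the two viewers, not from the
+ * patch itself:
+ *   StringArrayViewer v(slots_ptr);    // offset-only slots, n+1 of them
+ *   str s = v(i);                      // bytes [slots[i].offset, slots[i+1].offset)
+ *   StringPointerArrayViewer pv(data); // {length, offset} views defined below
+ *   str t = pv(i);                     // views[i].offset is relative to the view array base
+ * )
+ *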
In theory we could unify this interface and the interface of StringArrayViewer + */ +namespace cengine { + struct StringPointerArrayViewer { + struct View { + u32 length; + u32 offset; + }; + static_assert(sizeof(View) == 8); + const View* views; + + explicit StringPointerArrayViewer(const u8 *data) { + this->views = reinterpret_cast<const View *>(data); + } + + inline str operator()(u32 i) const + { + return {reinterpret_cast<const char *>(this->views) + views[i].offset, views[i].length}; + } + }; +// ------------------------------------------------------------------------------------- +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/cengine/utils/Utils.cpp b/benchmarks/analyze_better_blocks/cengine/utils/Utils.cpp new file mode 100644 index 0000000..b96b82a --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/utils/Utils.cpp @@ -0,0 +1,24 @@ +#include "Utils.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +#include <cmath> +namespace cengine { +u32 Utils::getBitsNeeded(u32 input) +{ + return std::max(std::floor(std::log2(input)) + 1, 1.0); +} +// ------------------------------------------------------------------------------------- +u32 Utils::getBitsNeeded(u64 input) +{ + return std::max(std::floor(std::log2(input)) + 1, 1.0); +} +// ------------------------------------------------------------------------------------- +u32 Utils::getBitsNeeded(s32 input) +{ + if (input < 0) { + return 32; + } + return std::max(std::floor(std::log2(input)) + 1, 1.0); +} +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/cengine/utils/Utils.hpp b/benchmarks/analyze_better_blocks/cengine/utils/Utils.hpp new file mode 100644 index 0000000..e4138f7 --- /dev/null +++ b/benchmarks/analyze_better_blocks/cengine/utils/Utils.hpp @@ -0,0 +1,126 @@ +#pragma once + +#include <cstring> +#include <fstream> +#include "Units.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +namespace cengine { +// ------------------------------------------------------------------------------------- +class Utils { +public: + static inline u64 alignBy(u64 num, u64 alignment, u64 &diff) { + u64 new_num = (num + alignment-1) & ~(alignment-1); + diff = new_num - num; + return new_num; + }; + static u32 getBitsNeeded(u32); + static u32 getBitsNeeded(u64); + static u32 getBitsNeeded(s32); + static inline void multiplyString(char *dest, const char *src, u32 len, u32 n, u32 src_n) { + // IDEA: + // Move this check out of here and only perform it when these cases can actually occur + if (n == 0 || len == 0) { + return; + } + + u32 n_written = std::min(n, src_n); + std::memcpy(dest, src, len * n_written); + char *write_ptr = dest + (len * n_written); + + while (n_written < n) { + u32 write_n = std::min(n_written, n - n_written); + std::size_t write_len = write_n * len; + + std::memcpy(write_ptr, dest, write_len); + + write_ptr += write_len; + n_written += write_n; + } + } + + static inline void multiplyU32(u32 *dest, const u32 *src, u32 n) { + multiplyString(reinterpret_cast<char *>(dest), reinterpret_cast<const char *>(src), sizeof(u32), n, 1); + } +
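+ // multiplyString fills dest with n copies of the len-byte pattern in src by
+ // doubling: after seeding src_n copies, each memcpy re-reads what was already
+ // written, so the filled region roughly doubles per iteration (O(log n) calls).
+ // A hedged usage sketch (buffer names are illustrative, not from the patch):
+ //   char pattern[4] = {'a', 'b', 'c', 'd'};
+ //   std::vector<char> out(4 * 1000);
+ //   Utils::multiplyString(out.data(), pattern, 4, 1000, 1);
+ //   // out now holds "abcd" repeated 1000 times
+ static inline u32 *writeOffsetsU32(u32 *dest, u32 start, u32 len,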
u32 n) { + /* + * Writes u32 offsets for the string viewer. The first offset will be start, and each following offset increments by len. + * A total of at least n offsets will be written. + * + * After the operation the memory should look like this: + * dest[0] = start + * dest[1] = start + length + * dest[2] = start + 2*length + * ... + * dest[n-1] = start + (n-1) * length + * + * WARNING: This may write up to 8 additional offsets past the given array. Make sure to properly allocate space + * and to not overwrite other existing data in that additional space. + */ + static_assert(sizeof(u32) == 4); + + // First 8 writes sequentially unrolled + dest[0] = start; + dest[1] = start + len; + dest[2] = start + 2 * len; + dest[3] = start + 3 * len; + dest[4] = start + 4 * len; + dest[5] = start + 5 * len; + dest[6] = start + 6 * len; + dest[7] = start + 7 * len; + + if (n <= 8) { + return dest + n; + } + + // IDEA: + // Could maybe improve performance further by + // - moving the loadu_si256 + // - replacing the loadu_ and the first 8 elements by a write to a vector register and a storeu +#ifdef BTR_USE_SIMD + auto write_ptr = dest + 8; + auto *end = dest + n; + const __m256i len_v = _mm256_set1_epi32(len*8); + __m256i current = _mm256_loadu_si256(reinterpret_cast<__m256i *>(dest)); + while (write_ptr < end) { + current = _mm256_add_epi32(current, len_v); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(write_ptr), current); + write_ptr += 8; + } + return end; +#else + for (auto i = 8u; i != n; ++i) { + // could change the multiplication to an addition, but that adds a data dependency. + // let's trust the compiler to do the right thing. + dest[i] = start + i * len; + } + return dest + n; +#endif + } + + static inline void readFileToMemory(const std::string &path, std::vector<char> &target) { + std::ifstream file(path, std::ios::binary | std::ios::ate); + if (!file.good()) { + auto msg = "Failed to open " + path; + perror(msg.c_str()); + throw Generic_Exception(msg); + } + std::streamsize filesize = file.tellg(); + file.seekg(0, std::ios::beg); + + target.resize(filesize); + file.read(target.data(), filesize); + + file.close(); + if (file.fail()) { + auto msg = "Reading " + path + " failed"; + perror(msg.c_str()); + throw Generic_Exception(msg); + } + } +}; +// ------------------------------------------------------------------------------------- +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/cmake/.gitkeep b/benchmarks/analyze_better_blocks/cmake/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/cmake/clang-tidy.cmake b/benchmarks/analyze_better_blocks/cmake/clang-tidy.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/codestyle.xml b/benchmarks/analyze_better_blocks/codestyle.xml new file mode 100644 index 0000000..68ef8a5 --- /dev/null +++ b/benchmarks/analyze_better_blocks/codestyle.xml @@ -0,0 +1,58 @@ + + + diff --git a/benchmarks/analyze_better_blocks/harbook.cpp b/benchmarks/analyze_better_blocks/harbook.cpp new file mode 100644 index 0000000..a0676e1 --- /dev/null +++ b/benchmarks/analyze_better_blocks/harbook.cpp @@ -0,0 +1,329 @@ +// // #include "Units.hpp" +#include "MMapvector.hpp" +#include "Exceptions.hpp" +#include "parser/Parser.hpp" +#include "storage/Relation.hpp" +#include "datablock/Datablock.hpp" +#include "datablock/CMachine.hpp" +#include "datablock/schemes/CSchemePool.hpp" +#include "analysis/Analysis.hpp"
+#include "extern/BZIP2.hpp" +#include "extern/LZ4.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +#include "yaml-cpp/yaml.h" +#include "spdlog/spdlog.h" +#include "spdlog/sinks/rotating_file_sink.h" +#include "tbb/parallel_for.h" +#include "tbb/task_scheduler_init.h" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +using namespace std; +// ------------------------------------------------------------------------------------- +DEFINE_string(out, "", "Output directory for parsed columns (binary format)"); +DEFINE_string(yaml, "", "Relation's descriptor file in YAML"); +// ------------------------------------------------------------------------------------- +DEFINE_bool(print_csv, false, ""); +DEFINE_bool(print_header, false, ""); +// ------------------------------------------------------------------------------------- +DEFINE_bool(analyze, false, ""); +DEFINE_bool(verify, false, ""); +DEFINE_bool(include_null_bitmap, false, ""); +DEFINE_bool(parse, false, "Parse the data before processing"); +DEFINE_bool(only_parse, false, "Stop after parsing"); +DEFINE_bool(print_chunk_sample, false, ""); +DEFINE_string(single_in, "", "Prase single column file only"); +DEFINE_uint32(chunks, 9999, "Limit the maximum number of processed relation chunks"); +DEFINE_uint32(split_strategy, 1, ""); +DEFINE_uint32(threads, 20, ""); +DEFINE_uint32(schemes, 12, ""); +// ------------------------------------------------------------------------------------- +DEFINE_bool(log_stdout, false, ""); +DEFINE_string(decision_tree, "", ""); +DEFINE_string(estimation_deviation, "", ""); +DEFINE_string(fsst_stats, "", ""); +// ------------------------------------------------------------------------------------- +DECLARE_bool(db1); +DECLARE_bool(db2); +DECLARE_uint32(sample_size); +DECLARE_uint32(sample_count); +DECLARE_uint64(block_size); +DEFINE_bool(bzip2, false, ""); +DEFINE_bool(lz4, false, ""); +// ------------------------------------------------------------------------------------- +DEFINE_string(variant, "latest", ""); +// ------------------------------------------------------------------------------------- +// Constants +const string KEY_RELATION_NAME = "r_name"; +const string KEY_COLUMN_NAME = "c_name"; +// ------------------------------------------------------------------------------------- +void signalHandler(int signum) +{ + cerr << "Interrupt signal (" << signum << ") received.\n"; + cerr << FLAGS_yaml << endl; + cerr << FLAGS_single_in << endl; + exit(signum); +} +int main(int argc, char **argv) +{ + // ------------------------------------------------------------------------------------- + signal(SIGINT, signalHandler); + signal(SIGTERM, signalHandler); + signal(SIGSEGV, signalHandler); + signal(SIGFPE, signalHandler); + signal(SIGABRT, signalHandler); + // ------------------------------------------------------------------------------------- + gflags::ParseCommandLineFlags(&argc, &argv, true); + // ------------------------------------------------------------------------------------- + cengine::db::CSchemePool::refresh(); + // ------------------------------------------------------------------------------------- + // Set the default logger to file logger + if ( !FLAGS_log_stdout ) { + auto file_logger = 
spdlog::rotating_logger_mt("main_logger", "log.txt", 1024 * 1024 * 10, 3); + spdlog::set_default_logger(file_logger); // change log pattern + } + spdlog::info("Started harbook with single_in = {}", FLAGS_single_in); + // ------------------------------------------------------------------------------------- + tbb::task_scheduler_init init(FLAGS_threads); + //tbb::task_scheduler_init init(tbb::task_scheduler_init::default_num_threads()); // Explicit number of threads + // ------------------------------------------------------------------------------------- + string schema_path = FLAGS_yaml; + string relation_name; + + cengine::Relation relation; + if ( FLAGS_single_in == "" ) { + const string out_dir = FLAGS_yaml.substr(0, FLAGS_yaml.length() - 5) + "/"; + string csv_path = FLAGS_yaml.replace(FLAGS_yaml.length() - 5, 5, ".csv"); + { + std::regex re("\\/([^\\/]+).(yaml)"); + std::smatch match; + if ( std::regex_search(schema_path, match, re) && match.size() > 1 ) { + relation_name = match.str(1); + } + } + const auto schema = YAML::LoadFile(schema_path); + if ( FLAGS_parse ) { + ifstream csv(csv_path); + if ( !csv.good()) { + throw Generic_Exception("Cannot open csv file."); + } + cengine::Parser::parse(csv_path, schema, out_dir); + cout << "Done parsing." << endl; + } + if ( FLAGS_only_parse ) { + return 0; + } + // ------------------------------------------------------------------------------------- + // Create relation out of yaml schema + relation = cengine::Relation(schema, out_dir); + } else { + { + std::regex re("\\/([^\\/]+)(\\/[^\\/]+).(integer|double|string)"); + std::smatch match; + if ( std::regex_search(FLAGS_single_in, match, re) && match.size() > 1 ) { + relation_name = match.str(1); + } + } + relation.addColumn(FLAGS_single_in); + } + // ------------------------------------------------------------------------------------- + if ( relation.columns.size() == 0 ) { + return 0; + } + relation.name = relation_name; + // ------------------------------------------------------------------------------------- + auto ranges = relation.getRanges(static_cast<cengine::SplitStrategy>(FLAGS_split_strategy), FLAGS_chunks); + vector> output_blocks; + cengine::db::Datablock datablockV2(relation); + // ------------------------------------------------------------------------------------- + // Poor man's stats + srand(time(NULL)); + vector<map<string, string>> results(relation.columns.size()); + // ------------------------------------------------------------------------------------- + if ( FLAGS_analyze ) { + results = cengine::analyzeRelation(relation); + } + // ------------------------------------------------------------------------------------- + if ((FLAGS_db1 && FLAGS_db2) || (!FLAGS_db1 && !FLAGS_db2)) { + throw Generic_Exception("You have to choose between db1 and db2"); + } + // ------------------------------------------------------------------------------------- + std::mutex compression_summary_mutex; + vector<map<u8, u32>> schemes_frequency(relation.columns.size()); + vector db_metas; + vector<u64> col_tuplet_counts(relation.columns.size(), 0); + vector<u64> before_col_sizes(relation.columns.size(), 0); + vector<u64> after_col_sizes(relation.columns.size(), 0); + vector<vector<string>> chunk_sample(ranges.size(), {relation.columns.size(), ""}); + vector compressed_chunks; + compressed_chunks.resize(ranges.size()); + // ------------------------------------------------------------------------------------- + if ( ranges.size() == 0 ) { + cerr << "Warning: col_id = " << FLAGS_single_in << " is empty!"
<< endl; + } + // ------------------------------------------------------------------------------------- + // External compression + // BZIP2 + vector after_col_bzip2_sizes(relation.columns.size(), 0); + // LZ4 + vector after_col_lz4_sizes(relation.columns.size(), 0); + vector after_db_lz4_sizes(relation.columns.size(), 0); + // ------------------------------------------------------------------------------------- + tbb::parallel_for(SIZE(0), ranges.size(), [&](SIZE chunk_i) { + cengine::db::ThreadCache::get().dump_meta.chunk_i = chunk_i; + // ------------------------------------------------------------------------------------- + auto chunk = relation.getChunk(ranges, chunk_i); + auto db_meta = datablockV2.compress(chunk, compressed_chunks[chunk_i]); + // ------------------------------------------------------------------------------------- + // External compression tools + { + vector> external_compression_results(relation.columns.size()); + for ( u32 col_i = 0; col_i < relation.columns.size(); col_i++ ) { + if ( FLAGS_bzip2 ) { + after_col_bzip2_sizes[col_i] += BZIP2::getCompressedSize(chunk.columns[col_i].get(), chunk.size(col_i)); + } + if ( FLAGS_lz4 ) { + after_col_lz4_sizes[col_i] += LZ4::getCompressedSize(chunk.columns[col_i].get(), chunk.size(col_i)); + u8 *compressed_column_ptr; + u32 compressed_column_size; + datablockV2.getCompressedColumn(compressed_chunks[chunk_i], col_i, compressed_column_ptr, compressed_column_size); + after_db_lz4_sizes[col_i] += LZ4::getCompressedSize(compressed_column_ptr, compressed_column_size); + } + } + } + { + lock_guard lock(compression_summary_mutex); + for ( u32 col_i = 0; col_i < relation.columns.size(); col_i++ ) { + col_tuplet_counts[col_i] += chunk.tuple_count; + // ------------------------------------------------------------------------------------- + auto used_scheme = db_meta.used_compression_schemes[col_i]; + + if ( schemes_frequency[col_i].find(used_scheme) == schemes_frequency[col_i].end()) { + schemes_frequency[col_i].insert({used_scheme, 1}); + } else { + schemes_frequency[col_i][used_scheme]++; + } + if(FLAGS_include_null_bitmap) { + after_col_sizes[col_i] += db_meta.nullmap_sizes[col_i]; + } + after_col_sizes[col_i] += db_meta.data_sizes[col_i]; + before_col_sizes[col_i] += chunk.size(col_i); + db_metas.push_back(db_meta); + } + } + // ------------------------------------------------------------------------------------- + if ( FLAGS_verify ) { + spdlog::info("Verifying chunk_i = {}", chunk_i); + vector decompressed_chunks; + if ( !(datablockV2.decompress(compressed_chunks[chunk_i]) == chunk)) { + cerr << "Compressed != Decompressed; used scheme = " << CI(db_metas[chunk_i].used_compression_schemes[0]) << " for chunk_i = " << chunk_i << endl; + cerr << schema_path << endl; + cerr << FLAGS_single_in << endl; + } + } + spdlog::info("Release chunk_i = {}", chunk_i); + chunk.reset(); + compressed_chunks[chunk_i].reset(); + }); + // ------------------------------------------------------------------------------------- + // Print decision tree + if ( FLAGS_decision_tree != "" ) { + ofstream decision_tree_file; + decision_tree_file.open(FLAGS_decision_tree); + for ( auto i = cengine::db::ThreadCache::data.begin(); + i != cengine::db::ThreadCache::data.end(); ++i ) { + decision_tree_file << i->log.str(); + } + decision_tree_file.close(); + } + // ------------------------------------------------------------------------------------- + if ( FLAGS_estimation_deviation != "" ) { + ofstream estimation_deviation_file; + 
estimation_deviation_file.open(FLAGS_estimation_deviation, std::ofstream::out | std::ofstream::app); + if ( estimation_deviation_file.tellp() == 0 ) { + estimation_deviation_file << "rel_name\tcol_name\tcol_type\tblock_i\tlevel\tscheme_name\testimated_cf\tbefore_size\tafter_size\tactual_cf\tcomment\tunique_count\n"; + } + for ( auto i = cengine::db::ThreadCache::data.begin(); + i != cengine::db::ThreadCache::data.end(); ++i ) { + estimation_deviation_file << i->estimation_deviation_csv.str(); + } + estimation_deviation_file.close(); + } + // ------------------------------------------------------------------------------------- + if ( FLAGS_fsst_stats != "" ) { + ofstream fsst_stats_file; + fsst_stats_file.open(FLAGS_fsst_stats, std::ofstream::out | std::ofstream::app); + if ( fsst_stats_file.tellp() == 0 ) { + fsst_stats_file << "rel_name\tcol_name\tblock_i\tbefore_col\tbefore_pool\tafter_pool\tafter_col\n"; + } + for ( auto i = cengine::db::ThreadCache::data.begin(); + i != cengine::db::ThreadCache::data.end(); ++i ) { + fsst_stats_file << i->fsst_csv.str(); + } + fsst_stats_file.close(); + } + // ------------------------------------------------------------------------------------- + cout << setprecision(4) << fixed; + // ------------------------------------------------------------------------------------- + string db_version_name = string("db_db") + (FLAGS_db1 ? "1" : "2"); + for ( u32 col_i = 0; col_i < relation.columns.size(); col_i++ ) { + auto &column_result = results[col_i]; + auto &column = relation.columns[col_i]; + column_result[KEY_COLUMN_NAME] = column.name; + column_result[KEY_RELATION_NAME] = relation.name; + column_result["c_type"] = ConvertTypeToString(relation.columns[col_i].type); + column_result["db_before_size"] = to_string(before_col_sizes[col_i]); + column_result["db_tuple_count"] = to_string(col_tuplet_counts[col_i]); + // ------------------------------------------------------------------------------------- + column_result[db_version_name + "_size"] = to_string(after_col_sizes[col_i]); + column_result[db_version_name + "_bits_pe"] = to_string(CD(after_col_sizes[col_i]) * 8.0 / CD(col_tuplet_counts[col_i])); + // ------------------------------------------------------------------------------------- + // External compression : data aggregation + if ( FLAGS_bzip2 ) { + column_result["db_bzip2_size"] = to_string(after_col_bzip2_sizes[col_i]); + column_result["db_bzip2_bits_pe"] = to_string(CD(after_col_bzip2_sizes[col_i]) * 8.0 / CD(col_tuplet_counts[col_i])); + } + if ( FLAGS_lz4 ) { + column_result["db_lz4_size"] = to_string(after_col_lz4_sizes[col_i]); + column_result["db_lz4_bits_pe"] = to_string(CD(after_col_lz4_sizes[col_i]) * 8.0 / CD(col_tuplet_counts[col_i])); + column_result[db_version_name + "_lz4_size"] = to_string(after_db_lz4_sizes[col_i]); + column_result[db_version_name + "_lz4_bits_pe"] = to_string(CD(after_db_lz4_sizes[col_i]) * 8.0 / CD(col_tuplet_counts[col_i])); + } + // ------------------------------------------------------------------------------------- + if ( FLAGS_db2 ) { + for ( u8 scheme_i = 0; scheme_i < FLAGS_schemes; scheme_i++ ) { + auto it = schemes_frequency[col_i].find(scheme_i); + if ( it == schemes_frequency[col_i].end()) { + column_result["s_" + to_string(scheme_i)] = "0"; + } else { + column_result["s_" + to_string(scheme_i)] = to_string(it->second); + } + } + } + } + // ------------------------------------------------------------------------------------- + // Print header + if ( FLAGS_print_csv ) { + if ( FLAGS_print_header ) { + for 
( auto &map_tuple: results[0] ) { + cout << map_tuple.first << '\t'; + } + cout << endl; + } + for ( auto &column_result: results ) { + for ( auto &map_tuple : column_result ) { + cout << map_tuple.second << '\t'; + } + cout << endl; + } + } + return 0; +} diff --git a/benchmarks/analyze_better_blocks/playground/.dir-locals.el b/benchmarks/analyze_better_blocks/playground/.dir-locals.el new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/playground/double.cpp b/benchmarks/analyze_better_blocks/playground/double.cpp new file mode 100644 index 0000000..36c8bcd --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/double.cpp @@ -0,0 +1,120 @@ +#include "Units.hpp" +#include "Reinterpret.hpp" +// ------------------------------------------------------------------------------------- +#include "PerfEvent.hpp" +// ------------------------------------------------------------------------------------- +#include +#include +#include // setprecision +#include +#include +#include + +#include "double-conversion/diy-fp.h" +#include "double-conversion/fast-dtoa.h" +#include "double-conversion/ieee.h" +#include "double-conversion/utils.h" + +// ------------------------------------------------------------------------------------- +using namespace std; +using namespace double_conversion; + +void printDouble(double input) +{ + union { + double d; + uint64_t u; + }; + + d = input; + bool sign = (u >> 63) & 0x1; + uint64_t exponent = (u >> 52) & 0x7FF; + uint64_t mantissa = u & 0xFFFFFFFFFFFFF; + + cout << sign << " " << bitset<11>(exponent) << " " << bitset<52>(mantissa) << " " + << std::setprecision(17) << d << endl; +} + +void printFloat(float input) +{ + union { + float d; + uint32_t u; + }; + + d = input; + bool sign = (u >> 31) & 0x1; + uint64_t exponent = (u >> 23) & 0xFF; + uint64_t mantissa = u & 0x7FFFFF; + + cout << sign << " " << bitset<8>(exponent) << " " << bitset<23>(mantissa) << " " << std::setprecision(7) << d << endl; +} +static Vector StringToVector(const char *str) +{ + return Vector(str, strlen(str)); +} + +const u8 max_exponent = 22; +static const double exact_powers_of_ten[] = { + 1.0, // 10^0 + 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0, 1000000000.0, 10000000000.0, // 10^10 + 100000000000.0, 1000000000000.0, 10000000000000.0, 100000000000000.0, 1000000000000000.0, 10000000000000000.0, 100000000000000000.0, 1000000000000000000.0, 10000000000000000000.0, 100000000000000000000.0, // 10^20 + 1000000000000000000000.0, + // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22 + 10000000000000000000000.0 +}; +int main(int, char **) +{ + std::string test_double("1.5"); + double d = std::stod(test_double); + float d_f = static_cast(d); + cout << ((static_cast(d_f) == d) ? 
"Y" : "N") << endl; + + //double wtf = std::stod("-2.2e-2"); +// printFloat(f); +// printDouble(d); +// // return 0; +// // ------------------------------------------------------------------------------------- +// // GDouble printing algorithm + +// char buffer_container[1024]; +// Vector buffer(buffer_container, 1024); +// int length; +// int point; +// bool status; + +// double min_double = std::stod(test_double); +// status = FastDtoa(min_double, FAST_DTOA_SHORTEST, 0, +// buffer, &length, &point); + + // ------------------------------------------------------------------------------------- + // Try optimized print double + { + // d = stod("128.83");//cin >> d; 128.83 16.24 + double d, cd; + cout << "enter your decimal to convert : "; + cin >> d; + + double current_double = d; + u32 e; + u64 sd; + bool convertable = false; + for ( e = 0; e <= max_exponent; e++ ) { + double cd = current_double * exact_powers_of_ten[e]; + cd = std::round(cd); + sd = static_cast(cd); + double if_converted_back = CD(sd) / exact_powers_of_ten[e]; + if ( if_converted_back == current_double && ((std::floor(std::log2(sd)) + 1) <= 32)) { + cout << "awesome here is your i = " << sd << ", e = " << e << endl; + convertable = true; + break; + } + } + if(!convertable){ + cout << "damn, not convertible !" << endl; + } + } + // ------------------------------------------------------------------------------------- + return 0; +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/playground/double_benchmarking.cpp b/benchmarks/analyze_better_blocks/playground/double_benchmarking.cpp new file mode 100644 index 0000000..7d43ecd --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/double_benchmarking.cpp @@ -0,0 +1,75 @@ +#include "Units.hpp" +// ------------------------------------------------------------------------------------- +#include "PerfEvent.hpp" +// ------------------------------------------------------------------------------------- +#include +#include +#include // setprecision +#include +#include +#include + +#include "double-conversion/strtod.h" +#include "double-conversion/utils.h" + +// ------------------------------------------------------------------------------------- +using namespace std; +using namespace double_conversion; + +void printDouble(double input) +{ + union { + double d; + uint64_t u; + }; + + d = input; + bool sign = (u >> 63) & 0x1; + uint64_t exponent = (u >> 52) & 0x7FF; + uint64_t mantissa = u & 0xFFFFFFFFFFFFF; + + cout << sign << " " << bitset<11>(exponent) << " " << bitset<52>(mantissa) << " " + << std::setprecision(17) << d << endl; +} + +void printFloat(float input) +{ + union { + float d; + uint32_t u; + }; + + d = input; + bool sign = (u >> 31) & 0x1; + uint64_t exponent = (u >> 23) & 0xFF; + uint64_t mantissa = u & 0x7FFFFF; + + cout << sign << " " << bitset<8>(exponent) << " " << bitset<23>(mantissa) << " " << std::setprecision(7) << d << endl; +} +static Vector StringToVector(const char* str) { + return Vector(str, strlen(str)); +} +int main(int, char **) +{ + auto my_vec = StringToVector("123"); + PerfEvent e; + { + PerfEventBlock b(e, 65000); + for(u32 i=0;i< 65000; i++) + double_conversion::Strtod(my_vec, -1); + } + { + PerfEventBlock b(e, 65000); + for(u32 i=0;i< 65000; i++){ + double d = 123456789; + d/=10; + d/=10; + d/=10; + d/=10; + + } + + } + return 0; +} +// ------------------------------------------------------------------------------------- diff --git 
a/benchmarks/analyze_better_blocks/playground/fetch-cols.sh b/benchmarks/analyze_better_blocks/playground/fetch-cols.sh new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/playground/fetch-double-cols.sh b/benchmarks/analyze_better_blocks/playground/fetch-double-cols.sh new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/playground/for_tests.cpp b/benchmarks/analyze_better_blocks/playground/for_tests.cpp new file mode 100644 index 0000000..47909ba --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/for_tests.cpp @@ -0,0 +1,56 @@ +// +// Created by david on 10.06.22. +// + +#include "Units.hpp" +#include "datablock/schemes/v2/integer/PBP.hpp" +#include "datablock/schemes/CScheme.hpp" + +template <typename T> +static void print_vec(std::vector<T> &v, const char *msg) { + std::cerr << msg << ": "; + for (const T &elem : v) { + std::cerr << elem << " "; + } + std::cerr << std::endl; +} + +static bool test_compression(std::vector<INTEGER> &src, cengine::db::IntegerScheme &scheme) { + std::vector<INTEGER> dst(src.size(), 0); + size_t original_size = src.size() * sizeof(INTEGER); + std::vector<u8> compressed(original_size * 10); // Size * 10 just to be sure + + auto src_ptr = src.data(); + auto compressed_ptr = reinterpret_cast<u8 *>(compressed.data()); + auto dst_ptr = dst.data(); + + cengine::db::SInteger32Stats stats(src_ptr, nullptr, src.size()); + + u32 compressed_size = scheme.compress(src_ptr, nullptr, compressed_ptr, stats, 0); + std::cerr << "Compression done. Old size: " << original_size << " new size: " << compressed_size << std::endl; + + scheme.decompress(dst_ptr, nullptr, compressed_ptr, stats.tuple_count, 0); + std::cerr << "Decompression done." << std::endl; + print_vec(src, "src"); + print_vec(dst, "dst"); + + bool ret = src == dst; + + // Zero out dst + std::fill(dst.begin(), dst.end(), 0); + + return ret; +} + +int main(void) { + std::vector<INTEGER> src {1, -1, 0, -10, 10, -300, -2147483648, 2147483647, -2, 128}; + bool result; + + cengine::db::v2::integer::PBP pbp; + result = test_compression(src, pbp); + assert(result); + + cengine::db::v2::integer::FBP fbp; + result = test_compression(src, fbp); + assert(result); +} diff --git a/benchmarks/analyze_better_blocks/playground/fsst_0.cpp b/benchmarks/analyze_better_blocks/playground/fsst_0.cpp new file mode 100644 index 0000000..5bc559c --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/fsst_0.cpp @@ -0,0 +1,35 @@ +#include "fsst.h" +#include "Units.hpp" +#include "MMapvector.hpp" +#include <iostream> +#include <cstring> + +using namespace std; + +/* + * corner case: setting n to 0 makes fsst hang with 100% core usage.
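+ * (A hedged mitigation sketch, not from the patch: guard the call site with
+ *   if (fsst_n == 0) return 0;
+ * so fsst_create is never handed an empty input set.)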
+ */ +int main(int argc, char **argv) +{ + unsigned long fsst_n = 1993; + + auto input_string_buffers = std::unique_ptr(new u8 *[fsst_n]); + auto input_string_lengths = std::unique_ptr(new u64[fsst_n]); + auto output_string_buffers = std::unique_ptr(new u8 *[fsst_n]); + auto output_string_lengths = std::unique_ptr(new u64[fsst_n]); + + + Vector fsst_strings; + fsst_strings.readBinary("fsst_strings"); + Vector fsst_lengths; + fsst_lengths.readBinary("fsst_lengths"); + + for(u32 s_i =0; s_i < fsst_n; s_i++) { + cout << fsst_strings[s_i] << endl; + input_string_buffers[s_i] = (u8*) malloc(fsst_lengths[s_i]); + memcpy(input_string_buffers[s_i],fsst_strings[s_i].data(), fsst_lengths[s_i]); + input_string_lengths[s_i] = fsst_lengths[s_i]; + } + fsst_encoder_t *encoder = fsst_create(fsst_n, input_string_lengths.get(), input_string_buffers.get(), 0); + return 0; +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/playground/fsst_benchmark.cpp b/benchmarks/analyze_better_blocks/playground/fsst_benchmark.cpp new file mode 100644 index 0000000..6e441d7 --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/fsst_benchmark.cpp @@ -0,0 +1,250 @@ +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "spdlog/common.h" +#include "spdlog/spdlog.h" +#include "spdlog/fmt/bundled/ranges.h" + +#include "PerfEvent.hpp" +#include "PerfExternal.hpp" +#include "MMapvector.hpp" + +#include "fsst.h" + +using std::stringstream; + +DEFINE_string(fsst_stats, "", ""); +DEFINE_string(file_list_file, "pbi-string-columns.txt", "file-list"); +DEFINE_bool(log_info, false, "log"); + +struct InputFiles { + std::ifstream list; + InputFiles(const std::string& filename) : list(filename) + { + std::cout << "file " << filename << std::endl; + } + + bool next(std::string& output) + { + return !(std::getline(list, output).fail()); + } +}; + + +std::string ensure_file(const std::string& object) +{ + static const std::string bucket = "s3://public-bi-benchmark/binary/"; + std::string outfile = "columns/" + object; + stringstream _cmd; + _cmd << "bash -c 'mkdir -p columns; test -f \"" << outfile + << "\" && echo \"file exists, skipping download\" || (echo " + "\"downloading file\"; aws s3 cp \"" + << bucket << object << "\" \"" << outfile << "\")' 1>&2"; + std::string cmd(_cmd.str()); + spdlog::info("running {}", cmd); + system(cmd.c_str()); + return outfile; +} + +using Column = Vector; +constexpr size_t SCRATCH_SIZE = 3ul*1024*1024*1024; + +struct FSSTRuntime { + inline static unsigned char* compressed; + inline static unsigned char* decompressed; + + unsigned char* input_data_start; + size_t len_sum{0}; + Column& input; + unsigned char ** input_buffers, **output_buffers; + uint64_t* input_lengths, *output_lengths; + + explicit FSSTRuntime(Column& input) + : input(input) + , input_buffers(new unsigned char*[input.size()]) + , output_buffers(new unsigned char*[input.size()]) + , input_lengths(new uint64_t[input.size()]) + , output_lengths(new uint64_t[input.size()]) { + + if (!compressed) { + compressed = new unsigned char[SCRATCH_SIZE]; + } + + if (!decompressed) { + decompressed = new unsigned char[SCRATCH_SIZE]; + } + + input_data_start = reinterpret_cast(const_cast(input[0].data())); + for (unsigned str_i = 0; str_i < input.size(); str_i++) { + auto strview = input[str_i]; + auto* ptr = reinterpret_cast(const_cast(strview.data())); + input_data_start = std::min(ptr, input_data_start); + // fsst interface does not specify const :-< + input_buffers[str_i] = ptr; + 
input_lengths[str_i] = strview.size(); + len_sum += strview.size(); + } + } + + ~FSSTRuntime() { + delete[] input_lengths; + delete[] input_buffers; + delete[] output_lengths; + delete[] output_buffers; + } + + size_t size() const { return input.size(); } + + void assert_equal(size_t size) { + die_if(memcmp(input_data_start, decompressed, size) == 0); + } + + void clear(bool compr = true) { + memset(decompressed, 0, SCRATCH_SIZE); + if (compr) { + memset(compressed, 0, SCRATCH_SIZE); + } + } +}; + +uint64_t compress(FSSTRuntime& rt, PerfEvent& e) +{ + uint64_t fsst_strings_used_space; + e.setParam("phase", "compress"); + { + //PerfEventBlock perf(e, rt.size()); + auto* write_ptr = rt.compressed; + auto* encoder = + fsst_create(rt.size(), rt.input_lengths, rt.input_buffers, 0); + die_if(fsst_export(encoder, write_ptr) > 0); + auto fsst_table_used_space = FSST_MAXHEADER; + write_ptr += fsst_table_used_space; + + auto count = fsst_compress(encoder, rt.size(), rt.input_lengths, + rt.input_buffers, SCRATCH_SIZE - FSST_MAXHEADER, + write_ptr, rt.output_lengths, rt.output_buffers); + die_if(count == rt.size()); + + fsst_strings_used_space = + rt.output_lengths[rt.size() - 1] + + (rt.output_buffers[rt.size() - 1] - rt.output_buffers[0]); + + fsst_destroy(encoder); + } + return fsst_strings_used_space; +} + +constexpr size_t decomp_cnt = 1e9; + +void test_individual(FSSTRuntime& rt, PerfEvent& e, uint64_t& fsst_strings_used_space) { + if(fsst_strings_used_space == 0) { fsst_strings_used_space = compress(rt, e); }; + e.setParam("phase", "decompress"); + uint64_t decompressed_size; + size_t repeat = std::max(1ul, decomp_cnt/rt.size()); + { + PerfEventBlock perf(e, repeat*rt.size()); + //PerfExternalBlock perf; + auto* read_ptr = rt.compressed; + auto* write_ptr = rt.decompressed; + + fsst_decoder_t decoder; + auto header_bytes = fsst_import(&decoder, read_ptr); + die_if(header_bytes > 0 && header_bytes < FSST_MAXHEADER); + read_ptr += FSST_MAXHEADER; + + + for (auto i = 0u; i != repeat; ++i) { + + write_ptr = rt.decompressed; + read_ptr = rt.compressed + FSST_MAXHEADER; + //#pragma GCC unroll 32 + for (auto str_i = 0u; str_i != rt.size(); ++str_i) { + auto len = rt.output_lengths[str_i]; + auto bytes = fsst_decompress(&decoder, len, read_ptr, SCRATCH_SIZE, + write_ptr); + // assert(bytes == rt.input_lengths[str_i]); + read_ptr += len; + write_ptr += bytes; + } + } + decompressed_size = write_ptr - rt.decompressed; + } + rt.assert_equal(decompressed_size); +} + +void test_batch(FSSTRuntime& rt, PerfEvent& e, uint64_t& fsst_strings_used_space) { + if(fsst_strings_used_space == 0) { fsst_strings_used_space = compress(rt, e); }; + e.setParam("phase", "decompress"); + uint64_t decompressed_size; + + size_t repeat = std::max(1ul, decomp_cnt/rt.size()); + { + PerfEventBlock perf(e, repeat*rt.size()); + //PerfExternalBlock perf; + auto* read_ptr = rt.compressed; + auto* write_ptr = rt.decompressed; + + fsst_decoder_t decoder; + auto header_bytes = fsst_import(&decoder, read_ptr); + die_if(header_bytes > 0 && header_bytes < FSST_MAXHEADER); + read_ptr += FSST_MAXHEADER; + + auto compressed_strings = read_ptr; + + for (auto i = 0u; i != repeat; ++i) { + write_ptr = rt.decompressed; + read_ptr = rt.compressed + FSST_MAXHEADER; + write_ptr += fsst_decompress( + &decoder, fsst_strings_used_space, compressed_strings, + SCRATCH_SIZE, write_ptr); + } + decompressed_size = write_ptr - rt.decompressed; + } + rt.assert_equal(decompressed_size); +} + + + +int main(int argc, char *argv[]) { + 
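+ // A hypothetical invocation, using the flags defined at the top of this file
+ // (the list file holds one column object name per line):
+ //   ./fsst_benchmark --file_list_file pbi-string-columns.txt --log_info
+ // Each column is compressed once, then decoded string-by-string ("single")
+ // and with one bulk fsst_decompress call ("batch") for comparison.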
gflags::ParseCommandLineFlags(&argc, &argv, true); + spdlog::set_level(FLAGS_log_info ? spdlog::level::info : spdlog::level::warn); + PerfEvent perf; + std::cerr << "using input file " << FLAGS_file_list_file << std::endl; + + InputFiles file_list(FLAGS_file_list_file); + + std::string nextfile; + int i = 0; + while (file_list.next(nextfile)) { + if (i++ > 10) { break; } + + std::string uncompfile = ensure_file(nextfile); + Column input(uncompfile.c_str()); + + FSSTRuntime rt(input); + uint64_t compressed_size = compress(rt, perf); + + perf.setParam("column", nextfile); + perf.setParam("strlen", ((double)rt.len_sum)/rt.size()); + perf.setParam("compr", ((double)rt.len_sum)/compressed_size); + perf.setParam("count", ((double)rt.len_sum)/compressed_size); + + perf.setParam("type", "single"); + test_individual(rt, perf, compressed_size); + + rt.clear(false); + + perf.setParam("type", "single"); + test_individual(rt, perf, compressed_size); + + rt.clear(false); + + perf.setParam("type", "batch"); + test_batch(rt, perf, compressed_size); + + rt.clear(); + } +} diff --git a/benchmarks/analyze_better_blocks/playground/generate_s3_data.cpp b/benchmarks/analyze_better_blocks/playground/generate_s3_data.cpp new file mode 100644 index 0000000..41ad08d --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/generate_s3_data.cpp @@ -0,0 +1,189 @@ +#include +#include +#include + +#include "tbb/parallel_for.h" +#include "tbb/task_scheduler_init.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const char * const bucket_prefix = "bucketprefix-s3-test-data"; +static const size_t part_size = 128 * 1024 * 1024; + +static void usage(const char *program) { + std::cerr << "Usage: " << program << " " << std::endl; + exit(EXIT_FAILURE); +} + +static std::string get_key(long object_size, long object_idx) { + std::stringstream key; + key << object_size << "/" << object_idx; + return key.str(); +} + +static bool put_object(const Aws::S3Crt::S3CrtClient &s3_client, const std::string &bucket, std::shared_ptr &sstream, long object_idx, long object_size) { + Aws::S3Crt::Model::PutObjectRequest put_request; + + put_request.SetBucket(bucket); + put_request.SetKey(get_key(object_size, object_idx)); + put_request.SetBody(sstream); + + Aws::S3Crt::Model::PutObjectOutcome outcome = s3_client.PutObject(put_request); + + if (!outcome.IsSuccess()) { + std::cerr << "Put request failed" << std::endl; + return false; + } + return true; +} + +static void generate_data(std::shared_ptr &sstream, long object_size) { + thread_local std::random_device rnd_device; + // Specify the engine and distribution. 
+ thread_local std::mt19937 mersenne_engine {rnd_device()}; // Generates random integers + thread_local std::uniform_int_distribution<int> dist {std::numeric_limits<char>::min(), std::numeric_limits<char>::max()}; + + while(object_size--) { + char c = static_cast<char>(dist(mersenne_engine)); + sstream->put(c); + } +} + +static std::pair<bool, Aws::String> upload_part(const Aws::S3Crt::S3CrtClient &s3_client, const std::string &bucket, std::string &key, std::string &upload_id, size_t part_number, std::shared_ptr<std::stringstream> &sstream) { + Aws::S3Crt::Model::UploadPartRequest upload_request; + upload_request.SetUploadId(upload_id); + upload_request.SetBucket(bucket); + upload_request.SetKey(key); + upload_request.SetPartNumber(part_number); + upload_request.SetBody(sstream); + + auto upload_outcome = s3_client.UploadPart(upload_request); + return {upload_outcome.IsSuccess(), upload_outcome.GetResult().GetETag()}; +} + +static void generate_and_upload_multipart(const Aws::S3Crt::S3CrtClient &s3_client, const std::string &bucket, long number_of_objects, long object_size) { + tbb::parallel_for(long(0), number_of_objects, [&](long object_idx) { + auto key = get_key(object_size, object_idx); + + /* Create Upload */ + Aws::S3Crt::Model::CreateMultipartUploadRequest create_upload_request; + create_upload_request.SetBucket(bucket); + create_upload_request.SetKey(key); + auto create_upload_outcome = s3_client.CreateMultipartUpload(create_upload_request); + if (!create_upload_outcome.IsSuccess()) { + throw std::logic_error("Failed to create Multipart Upload"); + } + auto upload_id = create_upload_outcome.GetResult().GetUploadId(); + + /* Upload parts */ + size_t num_parts = (object_size + part_size - 1) / part_size; + std::vector<Aws::S3Crt::Model::CompletedPart> completed_parts(num_parts); + tbb::parallel_for(size_t(1), num_parts+1, [&](size_t part_number){ + auto sstream = std::make_shared<std::stringstream>(); + generate_data(sstream, std::min(part_size, static_cast<size_t>(object_size))); + auto [success, etag] = upload_part(s3_client, bucket, key, upload_id, part_number, sstream); + if (!success) { + bool cleanup_done = false; + while(!cleanup_done) { + std::cerr << "Uploading part " << part_number << " for id " << upload_id << " failed. Attempting abort.." << std::endl; + Aws::S3Crt::Model::AbortMultipartUploadRequest abort_upload_request; + abort_upload_request.SetUploadId(upload_id); + abort_upload_request.SetBucket(bucket); + abort_upload_request.SetKey(key); + + auto abort_upload_outcome = s3_client.AbortMultipartUpload(abort_upload_request); + cleanup_done = abort_upload_outcome.IsSuccess(); + + // List parts to make sure there is nothing left. The docs say we have to do this to ensure proper cleanup.
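+ // (Hedged paraphrase of that recommendation: AbortMultipartUpload can race
+ // with in-flight UploadPart calls, so the loop below only finishes once
+ // ListParts reports zero remaining parts, i.e. effectively
+ //   cleanup_done = abort_ok && parts_left == 0;
+ // and any ListParts failure forces another round.)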
+ Aws::S3Crt::Model::ListPartsRequest list_parts_request; + list_parts_request.SetUploadId(upload_id); + list_parts_request.SetBucket(bucket); + list_parts_request.SetKey(key); + auto list_parts_outcome = s3_client.ListParts(list_parts_request); + if (list_parts_outcome.IsSuccess()) { + auto parts_left = list_parts_outcome.GetResult().GetParts().size(); + cleanup_done &= parts_left == 0; + } else { + cleanup_done = false; + } + } + + throw std::runtime_error("uploading part failed"); + } + completed_parts[part_number-1].SetPartNumber(part_number); + completed_parts[part_number-1].SetETag(etag); + }); + + + /* Complete Upload */ + Aws::S3Crt::Model::CompletedMultipartUpload completed_upload; + completed_upload.SetParts(completed_parts); + + Aws::S3Crt::Model::CompleteMultipartUploadRequest complete_upload_request; + complete_upload_request.SetUploadId(upload_id); + complete_upload_request.SetBucket(bucket); + complete_upload_request.SetKey(key); + complete_upload_request.SetMultipartUpload(completed_upload); + auto complete_upload_outcome = s3_client.CompleteMultipartUpload(complete_upload_request); + if (!complete_upload_outcome.IsSuccess()) { + throw std::logic_error("Failed to complete Multipart Upload"); + } + }); +} + +static void generate_and_upload(const Aws::S3Crt::S3CrtClient &s3_client, const std::string &bucket, long number_of_objects, long object_size) { + tbb::parallel_for(long(0), number_of_objects, [&](long object_idx) { + auto sstream = std::make_shared<std::stringstream>(); + generate_data(sstream, object_size); + + if (!put_object(s3_client, bucket, sstream, object_idx, object_size)) { + throw std::runtime_error("Upload to S3 failed"); + } + }); +} + +int main(int argc, char **argv) { + if (argc != 4) { + usage(argv[0]); + } + + long number_of_objects = std::stol(argv[1]); + long object_size = std::stol(argv[2]); + auto region = argv[3]; + std::stringstream bucket; + bucket << bucket_prefix << "-" << region; + + //tbb::task_scheduler_init init(1); + + Aws::SDKOptions options; + Aws::InitAPI(options); + + { + Aws::S3Crt::ClientConfiguration config; + config.partSize = part_size; + config.throughputTargetGbps = 100.0; + config.region = region; + + Aws::S3Crt::S3CrtClient s3_client(config); + +//#define MULTIPART +#ifdef MULTIPART + generate_and_upload_multipart(s3_client, bucket.str(), number_of_objects, object_size); +#else + generate_and_upload(s3_client, bucket.str(), number_of_objects, object_size); +#endif + } + + Aws::ShutdownAPI(options); +} diff --git a/benchmarks/analyze_better_blocks/playground/local.cmake b/benchmarks/analyze_better_blocks/playground/local.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/playground/pbi-double-columns.txt b/benchmarks/analyze_better_blocks/playground/pbi-double-columns.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/playground/pbi-string-columns.txt b/benchmarks/analyze_better_blocks/playground/pbi-string-columns.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/playground/playground.cpp b/benchmarks/analyze_better_blocks/playground/playground.cpp new file mode 100644 index 0000000..04ca964 --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/playground.cpp @@ -0,0 +1,124 @@ +#include "Units.hpp" +#include <iostream> +#include <memory> +#include <cstring> +#include <cassert> +#include "fsst.h" +#include <cstdlib> /* srand, rand */ +#include <ctime> /* time */ +#include "gflags/gflags.h" +#include "PerfEvent.hpp" +// 
------------------------------------------------------------------------------------- +#include "headers/codecfactory.h" +#include "headers/deltautil.h" + +DEFINE_uint32(n, 10, ""); +DEFINE_uint32(s, 100, ""); +DEFINE_uint32(e, 26, ""); + +using namespace std; +int main(int argc, char **argv) +{ + gflags::SetUsageMessage(""); + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // ------------------------------------------------------------------------------------- + auto input_size = 1024; + auto in_arr = std::unique_ptr(new u32[input_size]); + using namespace FastPForLib; + for(int t = 0; t < input_size; t++) { + in_arr[t] = (1 << 1); + } + in_arr[10] = u32(1 << 31) - 1; + + auto fast_pfor = std::shared_ptr( + new CompositeCodec, VariableByte>()); + auto codec = std::shared_ptr( + new CompositeCodec, VariableByte>()); // + + size_t compressed_codes_size = input_size * 2; // not really used + // ------------------------------------------------------------------------------------- + auto dest = std::unique_ptr(new u32[input_size * 2]); + auto dest_integer = reinterpret_cast(dest.get()); + dest_integer = (dest_integer + 32) & ~3ul; + auto dest_4_aligned = reinterpret_cast(dest_integer); + // ------------------------------------------------------------------------------------- + fast_pfor->encodeArray(in_arr.get(), input_size, dest_4_aligned, compressed_codes_size); + cout << compressed_codes_size << std::endl; + // ------------------------------------------------------------------------------------- + codec->encodeArray(in_arr.get(), input_size, dest_4_aligned, compressed_codes_size); + cout << compressed_codes_size << std::endl; + return 0; + // ------------------------------------------------------------------------------------- + for(int t = 0; t < input_size; t++) { + in_arr[t] -= (1 << 24); + } +// ------------------------------------------------------------------------------------- + fast_pfor->encodeArray(in_arr.get(), input_size, dest_4_aligned, compressed_codes_size); + cout << compressed_codes_size << std::endl; + // ------------------------------------------------------------------------------------- + codec->encodeArray(in_arr.get(), input_size, dest_4_aligned, compressed_codes_size); + cout << compressed_codes_size << std::endl; + // ------------------------------------------------------------------------------------- + + return 0; + // ------------------------------------------------------------------------------------- + + srand(time(NULL)); + const u32 n = 10, s = FLAGS_s; + unsigned char *srcBuf[n] = {}; + unsigned char *dstBuf[n] = {}; + unsigned long srcLen[n] = {}; + unsigned long dstLen[n] = {}; +// unsigned long dstLen[2] = { 0, 0 }; + + auto in_array = std::unique_ptr[]>(); + in_array = std::unique_ptr[]>(new std::unique_ptr[n]); + + for ( auto i = 0; i < n; i++ ) { + srcBuf[i] = (u8 *) malloc(s); + for ( u32 b_i = 0; b_i < s; b_i++ ) { + srcBuf[i][b_i] = 65 + rand() % FLAGS_e; + } + srcLen[i] = s; + } + unsigned char serialized_encoder_buf[FSST_MAXHEADER]; + fsst_encoder_t *encoder = fsst_create(n, srcLen, srcBuf, 0); + unsigned long hdr = fsst_export(encoder, serialized_encoder_buf); + + auto output_buffer_size = 10 * 1024 * 1024; + auto output_buffer = (u8 *) malloc(output_buffer_size); + + auto n_compressed_strings = fsst_compress(encoder, n, srcLen, srcBuf, output_buffer_size, output_buffer, + dstLen, dstBuf); + assert(n_compressed_strings == n); + //fsst_destroy(encoder); + + // 
+    // -------------------------------------------------------------------------------------
+    // decompress time
+    unsigned char *decompressedBuf[n] = {};
+    fsst_decoder_t decoder;
+    PerfEvent e;
+    {
+        PerfEventBlock block(e,1);
+        decoder = fsst_decoder(encoder);
+    }
+    cout << sizeof(fsst_decoder_t) << endl;
+    cout << FSST_MAXHEADER << endl;
+    {
+        PerfEventBlock block(e,1);
+        fsst_import(&decoder, serialized_encoder_buf);
+    }
+
+    for ( auto i = 0; i < n; i++ ) {
+        decompressedBuf[i] = (u8 *) malloc(s);
+        assert(fsst_decompress(&decoder, dstLen[i], dstBuf[i], s, decompressedBuf[i]) == s);
+        assert(memcmp(decompressedBuf[i], srcBuf[i], s) == 0);
+    }
+
+    return 0;
+}
+/*
+ * Notes:
+ *
+ */
diff --git a/benchmarks/analyze_better_blocks/playground/pseudodecimal_benchmark.cpp b/benchmarks/analyze_better_blocks/playground/pseudodecimal_benchmark.cpp
new file mode 100644
index 0000000..2a8c18f
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/playground/pseudodecimal_benchmark.cpp
@@ -0,0 +1,176 @@
+#include "datablock/schemes/CSchemePicker.hpp"
+#include "datablock/schemes/v2/double/DynamicDictionary.hpp"
+#include "datablock/schemes/v2/double/Frequency.hpp"
+#include "datablock/schemes/v2/double/RLE.hpp"
+#include "datablock/schemes/v2/double/DoubleBP.hpp"
+
+#include "datablock/schemes/v2/integer/PBP.hpp"
+#include "gflags/gflags.h"
+#include "spdlog/spdlog.h"
+#include "spdlog/fmt/bundled/ranges.h"
+
+#include "Units.hpp"
+#include "PerfEvent.hpp"
+#include "datablock/schemes/v2/double/Decimal.hpp"
+#include "datablock/schemes/CScheme.hpp"
+#include "datablock/schemes/CSchemePool.hpp"
+
+#include
+#include
+#include
+
+
+// for some reason, this is only DECLARED in DynamicDictionary but not defined (breaks linking)
+// and then DEFINED in every cpp file that uses it
+DEFINE_string(fsst_stats, "", "");
+DEFINE_string(file_list_file, "pbi-double-columns.txt", "file-list");
+DEFINE_int32(cascade_depth, 1, "cascade");
+
+// example2.double: s3://public-bi-benchmark/binary/Telco/1/Telco_1/106_RECHRG_INC_MIN_USED_P1.double
+// example2.bitmap: s3://public-bi-benchmark/binary/Telco/1/Telco_1/106_RECHRG_INC_MIN_USED_P1.bitmap
+
+struct InputFiles {
+    std::ifstream list;
+    InputFiles(const std::string& filename) : list(filename) {
+        std::cout << "file " << filename << std::endl;
+    }
+
+    bool next(std::string& output) {
+        return !(std::getline(list, output).fail());
+    }
+};
+
+std::string ensure_file(const std::string& object)
+{
+//    static const std::string bucket = "s3://public-bi-benchmark/binary/";
+//    std::string outfile = "columns/" + object;
+//    std::stringstream _cmd;
+//    _cmd << "bash -c 'mkdir -p columns; test -f \"" << outfile
+//         << "\" && echo \"file exists, skipping download\" || (echo "
+//            "\"downloading file\"; aws s3 cp \""
+//         << bucket << object << "\" \"" << outfile << "\")'";
+//    std::string cmd(_cmd.str());
+//    spdlog::info("running {}", cmd);
+//    system(cmd.c_str());
+//    return outfile;
+    return object;
+}
+
+using T = double;
+bool test_compression(cengine::db::DoubleScheme &scheme, cengine::db::DoubleStats& stats, T* src, size_t size, PerfEvent& e, u8 cascade) {
+    std::vector<u8> compressed(size * sizeof(T) * 2);
+    std::vector<T> dst(size * 2, 0);
+
+    auto src_ptr = src;
+    auto compressed_ptr = reinterpret_cast<u8*>(compressed.data());
+    auto dst_ptr = dst.data();
+
+    size_t output_bytes{0};
+    e.setParam("cascade", cascade);
+    e.setParam("phase", "compression");
+    {
+        PerfEventBlock blk(e, size);
+        output_bytes = scheme.compress(src_ptr, nullptr, compressed_ptr, stats, cascade);
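+        // compression factor = uncompressed bytes / compressed bytes; e.g. a
+        // block of 65,536 doubles is 524,288 bytes, so output_bytes == 131,072
+        // would be recorded below as compr = 4.0 (illustrative numbers).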
//std::cout << "cf: " << 1.0 * size * sizeof(T) / output_bytes << std::endl; + e.setParam("compr", (1.0 * size * sizeof(T)) / output_bytes); + } + + e.setParam("phase", "decompression"); + { + PerfEventBlock blk(e, size); + scheme.decompress(dst_ptr, nullptr, compressed_ptr, stats.tuple_count, cascade); + } + //std::cerr << "Decompression done." << std::endl; + for (auto i = 0ul; i != size; ++i) { + die_if(src[i] == dst[i]); + } + + return 0; +} + +void setupSchemePool() +{ + using namespace cengine::db; + cengine::db::CSchemePool::refresh(); + auto& schemes = *cengine::db::CSchemePool::available_schemes; + return; + // double: DOUBLE_BP, UNCOMPRESSED, + for (auto it = schemes.double_schemes.begin(); + it != schemes.double_schemes.end();) { + if (it->first != DoubleSchemeType::DOUBLE_BP && + it->first != DoubleSchemeType::UNCOMPRESSED) { + it = schemes.double_schemes.erase(it); + } else { + ++it; + } + } + // int: X_FBP, UNCOMPRESSED, + for (auto it = schemes.integer_schemes.begin(); it != schemes.integer_schemes.end();) { + if (it->first != IntegerSchemeType::X_FBP && + it->first != IntegerSchemeType::UNCOMPRESSED) { + it = schemes.integer_schemes.erase(it); + } else { + ++it; + } + } +} + +int main(int argc, char *argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + setupSchemePool(); + spdlog::set_level(spdlog::level::info); + PerfEvent perf; + std::cerr << "using cascade depth " << FLAGS_cascade_depth << " and input file " << FLAGS_file_list_file << std::endl; + + InputFiles file_list(FLAGS_file_list_file); + + std::string nextfile; + int i = 0; + while (file_list.next(nextfile)) { + if (i++ > 10) { break; } + std::string outfile = ensure_file(nextfile); + + Vector doubles(outfile.c_str()); + + { + std::vector head(doubles.data, doubles.data + std::min(10ul, doubles.size())); + spdlog::info("size: {:03.2f} MiB, head: {}", (sizeof(T) * doubles.size()) * 1.0 / 1024 / 1024, head); + } + + perf.setParam("column", nextfile); + perf.setParam("scheme", "none"); + perf.setParam("compr", 1); + perf.setParam("cascade", 0); + + cengine::db::DoubleStats stats(doubles.data, nullptr, doubles.size()); + perf.setParam("phase", "stats"); + { + PerfEventBlock blk(perf, doubles.size()); + stats = cengine::db::DoubleStats::generateStats(doubles.data, nullptr, doubles.size()); + } + + perf.setParam("scheme", "bitpack"); + cengine::db::v2::d::DoubleBP bp; + test_compression(bp, stats, doubles.data, doubles.count, perf, 0); + + perf.setParam("scheme", "decimal"); + cengine::db::v2::d::Decimal pd; + test_compression(pd, stats, doubles.data, doubles.count, perf, 1); + test_compression(pd, stats, doubles.data, doubles.count, perf, 2); + + perf.setParam("scheme", "dict"); + cengine::db::v2::d::DynamicDictionary dict; + test_compression(dict, stats, doubles.data, doubles.count, perf, 1); + test_compression(dict, stats, doubles.data, doubles.count, perf, 2); + + perf.setParam("scheme", "rle"); + cengine::db::v2::d::RLE rle; + test_compression(rle, stats, doubles.data, doubles.count, perf, 1); + test_compression(dict, stats, doubles.data, doubles.count, perf, 2); + + //perf.setParam("scheme", "freq"); + //cengine::db::v2::d::Frequency freq; + //test_compression(freq, stats, doubles.data, doubles.count, perf, 1); + //test_compression(freq, stats, doubles.data, doubles.count, perf, 2); + } +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/playground/rle.cpp b/benchmarks/analyze_better_blocks/playground/rle.cpp new file mode 100644 index 0000000..fd456d5 --- /dev/null +++ 
diff --git a/benchmarks/analyze_better_blocks/playground/rle.cpp b/benchmarks/analyze_better_blocks/playground/rle.cpp
new file mode 100644
index 0000000..fd456d5
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/playground/rle.cpp
@@ -0,0 +1,130 @@
+//
+// A simple example to get you started with the library.
+// You can compile and run this example like so:
+//
+//   make example
+//   ./example
+//
+// Warning: If your compiler does not fully support C++11, some of
+// this example may require changes.
+//
+
+#include "headers/codecfactory.h"
+#include "headers/deltautil.h"
+#include "Units.hpp"
+#include "MMapvector.hpp"
+
+int main()
+{
+    // -------------------------------------------------------------------------------------
+    // Randomness generators
+    std::srand(std::time(nullptr));
+    // -------------------------------------------------------------------------------------
+    using namespace FastPForLib;
+    IntegerCODEC &codec = *CODECFactory::getFromName("simdfastpfor256");
+    size_t N = 1000 * 1000;
+    std::vector<u32> rle_input(N);
+    for ( uint32_t i = 0; i < N; i++ ) {
+        rle_input[i] = rand() % 100000;
+        if ( rand() % 10 > 2 ) {
+            size_t repeat = rand() % 7;
+            repeat = std::min(repeat, N - i - 1);
+            for ( size_t r_i = 1; r_i <= repeat; r_i++ ) {
+                rle_input[i + r_i] = rle_input[i];
+            }
+            i += repeat;
+        }
+    }
+    writeBinary("rle_input.integer", rle_input);
+    // 70% chance -> repeat the current value 0 to 6 more times
+
+    std::vector<u32> rle_output;
+
+    std::vector<u32> rle_values;
+    std::vector<u32> rle_count;
+
+    {
+        // RLE encoding
+        u32 last_item = rle_input[0];
+        u32 count = 1;
+        for ( uint32_t i = 1; i < N; i++ ) {
+            if ( rle_input[i] == last_item ) {
+                count++;
+            } else {
+                rle_output.push_back(count);
+                rle_count.push_back(count);
+                rle_output.push_back(last_item);
+                rle_values.push_back(last_item);
+                last_item = rle_input[i];
+                count = 1;
+            }
+        }
+        // flush the final run, which the loop above never emits
+        rle_output.push_back(count);
+        rle_count.push_back(count);
+        rle_output.push_back(last_item);
+        rle_values.push_back(last_item);
+    }
+
+    cout << " b = " << rle_input.size() << " a = " << rle_output.size() << " r = " << 1.0 * rle_input.size() / rle_output.size() << endl;
+    cout << "plain rle bytes = " << rle_output.size() * 4 << endl;
+    {
+        std::vector<u32> compressed_output(N + 1024);
+        size_t compressedsize = compressed_output.size();
+        codec.encodeArray(rle_input.data(), rle_input.size(), compressed_output.data(),
+                          compressedsize);
+        //
+        // if desired, shrink back the array:
+        compressed_output.resize(compressedsize);
+        compressed_output.shrink_to_fit();
+        // display compression rate:
+        std::cout << std::setprecision(3);
+        std::cout << "You are using "
+                  << 32.0 * static_cast<double>(compressed_output.size()) /
+                         static_cast<double>(rle_input.size())
+                  << " bits per integer. " << std::endl;
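+        // Three layouts are compared in this file: (1) bit-pack the raw array
+        // directly (this block), (2) bit-pack the interleaved (count, value)
+        // RLE pairs, and (3) bit-pack the count and value streams separately,
+        // which tends to pack tighter since counts are small and values large.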
" << std::endl; + + cout <<"direct bitpacking gives " << compressed_output.size() * 4 << endl; + } + + { + std::vector compressed_output(N + 1024); + size_t compressedsize = compressed_output.size(); + codec.encodeArray(rle_output.data(), rle_output.size(), compressed_output.data(), + compressedsize); + // + // if desired, shrink back the array: + compressed_output.resize(compressedsize); + compressed_output.shrink_to_fit(); + + cout <<"bitpacking rle plain gives " << compressed_output.size() * 4 << endl; + + } + // compress values and counts separately + { + u32 total_size = 0; + { + + //Delta::deltaSIMD(rle_values.data(), rle_values.size()); + + std::vector compressed_output(N + 1024); + size_t compressedsize = compressed_output.size(); + codec.encodeArray(rle_values.data(), rle_values.size(), compressed_output.data(), + compressedsize); + + total_size += compressedsize * 4; + } + { + std::vector compressed_output(N + 1024); + size_t compressedsize = compressed_output.size(); + codec.encodeArray(rle_count.data(), rle_count.size(), compressed_output.data(), + compressedsize); + + // if desired, shrink back the array: + compressed_output.resize(compressedsize); + compressed_output.shrink_to_fit(); + std::cout << std::setprecision(3); + std::cout << "You are using " + << 32.0 * static_cast(compressed_output.size()) / + static_cast(rle_count.size()) + << " bits per integer. " << std::endl; + total_size += compressedsize *4 ; + } + cout << " separate choice total_size " << total_size << endl; + } +} diff --git a/benchmarks/analyze_better_blocks/playground/s3-columns.txt b/benchmarks/analyze_better_blocks/playground/s3-columns.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/playground/sampling_algorithms.cpp b/benchmarks/analyze_better_blocks/playground/sampling_algorithms.cpp new file mode 100644 index 0000000..6935e4f --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/sampling_algorithms.cpp @@ -0,0 +1,326 @@ +#include +#include "datablock/schemes/CSchemePicker.hpp" +#include "datablock/cache/ThreadCache.hpp" +#include "Units.hpp" +#include "MMapvector.hpp" + +#include "PerfEvent.hpp" +#include "spdlog/spdlog.h" + +#include +#include +#include +#include +#include +#include + +using namespace std::string_literals; +using namespace cengine::db; + +DEFINE_int32(bm_max_cascade_depth, 3, "maximum compression cascade depth."); +DEFINE_int32(bm_sample_count, 10, "sample count"); +DEFINE_int32(bm_sample_size, 64, "sample size"); +DEFINE_int32(bm_skip_files, 0, "skip these many files from the input file list"); +DEFINE_int32(bm_file_count, 20, "test these many files"); +DEFINE_string(bm_input_file_list, "s3-columns.txt", "the file listing the s3 input objects"); +DEFINE_bool(bm_clear_filecache, false, "whether to remove s3 dowloaded files"); + +DEFINE_bool(sampling_test_mode, true, "we are in sampling test mode, trying all schemes."); + +// for some reason, this is only DECLARED in DynamicDictionary but not defined (breaks linking) +// and then DEFINED in every cpp file that uses it +DEFINE_string(fsst_stats, "", ""); + +struct InputFiles { + std::ifstream list; + InputFiles(const std::string& filename) : list(filename) { + spdlog::info("s3 list file %s", filename); + } + + bool next(std::string& output) { + return !(std::getline(list, output).fail()); + } +}; + +std::string ensure_file(const std::string& object) { + static const std::string bucket = "s3://public-bi-benchmark/binary/"s; + std::string outfile = "columns/"s + object; + 
+    std::stringstream _cmd;
+    _cmd << "bash -c '(mkdir -p columns; test -f \"" << outfile
+         << "\" && echo \"file exists, skipping download\" || (echo "
+            "\"downloading file\"; aws s3 cp \""
+         << bucket << object << "\" \"" << outfile << "\")) 1>&2'";
+    std::string cmd(_cmd.str());
+    spdlog::info("running {}", cmd);
+    system(cmd.c_str());
+    return outfile;
+}
+
+static constexpr size_t MAX_PBI_BYTES = 2019996069;
+static constexpr size_t BLOCK_SIZE = 65000;
+
+struct TestResult {
+
+    friend std::ostream& operator<<(std::ostream& s, const TestResult&) {
+        return s << "{}";
+    }
+};
+
+// NOTE: several template argument lists in this file were lost in the patch
+// text; the reconstructions below are inferred from how the types are used.
+template <typename SchemeType, typename Scheme>
+using pool_t = std::unordered_map<SchemeType, std::unique_ptr<Scheme>>;
+
+template <typename T>
+struct Schemes{};
+template <>
+struct Schemes<INTEGER> {
+    static constexpr char name[] = "integer";
+    using picker = IntegerSchemePicker;
+    using stats = SInteger32Stats;
+    using scheme = IntegerScheme;
+    using scheme_enum = IntegerSchemeType;
+    using pool = pool_t<scheme_enum, scheme>;
+
+    static const pool& schemes() {
+        return CSchemePool::available_schemes->integer_schemes;
+    }
+
+    static const std::set<scheme_enum> excluded() {
+        //return {};
+        return std::set<scheme_enum>({ IntegerSchemeType::ONE_VALUE });
+    }
+};
+template <>
+struct Schemes<DOUBLE> {
+    static constexpr char name[] = "double";
+    using picker = DoubleSchemePicker;
+    using stats = DoubleStats;
+    using scheme = DoubleScheme;
+    using scheme_enum = DoubleSchemeType;
+    using pool = pool_t<scheme_enum, scheme>;
+
+    static const pool& schemes() {
+        return CSchemePool::available_schemes->double_schemes;
+    }
+
+    static const std::set<scheme_enum> excluded() {
+        //return {};
+        return std::set<scheme_enum>({ DoubleSchemeType::ONE_VALUE });
+    }
+};
+template <>
+struct Schemes<str> {
+    static constexpr char name[] = "string";
+    using picker = StringSchemePicker;
+    using stats = StringStats;
+    using scheme = StringScheme;
+    using scheme_enum = StringSchemeType;
+    using pool = pool_t<scheme_enum, scheme>;
+
+    static const pool& schemes() {
+        return CSchemePool::available_schemes->string_schemes;
+    }
+    static const std::set<scheme_enum> excluded() {
+        //return {};
+        return std::set<scheme_enum>({ StringSchemeType::ONE_VALUE });
+    }
+};
+
+template <typename T>
+using sample_t = std::tuple<std::vector<T>, std::vector<BITMAP>>;
+
+template <typename T>
+struct Sampler {
+    using types = Schemes<T>;
+    using type = sample_t<T>;
+    using stats_t = typename types::stats;
+    virtual std::string name() const = 0;
+    virtual u32 sampled_items() const = 0;
+    virtual sample_t<T> operator()(const T* input, size_t count, stats_t& stats) const = 0;
+};
+
+template <typename T>
+struct RandomSampler : Sampler<T> {
+    using stats_t = typename Schemes<T>::stats;
+    RandomSampler(const std::string& name, u32 sample_size, u32 sample_count)
+        : _name(name)
+        , sample_size(sample_size)
+        , sample_count(sample_count){}
+
+    RandomSampler(u32 sample_size, u32 sample_count)
+        : _name("r" + std::to_string(sample_count) + "x" +
+                std::to_string(sample_size))
+        , sample_size(sample_size)
+        , sample_count(sample_count){}
+
+    RandomSampler() : RandomSampler(FLAGS_sample_size, FLAGS_sample_count) {}
+
+    std::string name() const override { return _name; }
+    u32 sampled_items() const override { return sample_size * sample_count; }
+
+    sample_t<T> operator()([[maybe_unused]] const T* input, [[maybe_unused]] size_t count, stats_t& stats) const override {
+        FLAGS_sample_count = this->sample_count;
+        FLAGS_sample_size = this->sample_size;
+        return stats.samples(this->sample_count, this->sample_size);
+    }
+
+    std::string _name;
+    u32 sample_size, sample_count;
+};
+
+template <typename T>
+TestResult testSampling(const std::string& filename, const std::string& bitmap_file, u8* output, PerfEvent& e) {
+    using types = Schemes<T>;
+    //using Picker = typename types::picker;
+    using Stats = typename types::stats;
+    //using Scheme = typename types::scheme;
+    auto& cache = ThreadCache::get();
+    auto excluded = types::excluded();
+    const Vector<T> infile(filename.c_str());
+    const Vector<BITMAP> bitmap(bitmap_file.c_str());
+
+    assert(infile.size() == bitmap.size());
+    size_t blocksize = std::min(infile.size(), BLOCK_SIZE);
+
+    e.setParam("file", "\"" + filename + "\"");
+    e.setParam("scheme", "sampling");
+    e.setParam("type", types::name);
+    e.setParam("insize", blocksize * sizeof(T)); // TODO strings
+
+    e.setParam("sampling", "full");
+    e.setParam("sample_size", blocksize);
+    e.setParam("sample_count", 1);
+    e.setParam("sampled_items", blocksize);
+    Stats whole_stats = Stats::generateStats(infile.data, bitmap.data, blocksize);
+    for (auto& [stype, scheme] : types::schemes()) {
+        if (excluded.find(stype) != excluded.end()) { continue; }
+        u32 outsize;
+        e.setParam("scheme", scheme->selfDescription());
+        {
+            PerfEventBlock blk(e, 1);
+            outsize = scheme->compress(infile.data, bitmap.data, output, whole_stats, FLAGS_bm_max_cascade_depth);
+            e.setParam("outsize", outsize);
+            e.setParam("compr", ((double)blocksize * sizeof(T))/((double)outsize));
+        }
+        std::fill(output, output + outsize, 0);
+    }
+
+    //std::array counts = {1, 10, 100, 1000};
+    //std::array sizes = {1, 16, 64, 256, 1024, 4096};
+
+    // std::vector<std::pair<u32, u32>> combinations = {
+    //     {640, 1}, {320, 2}, {160, 4}, {80, 8}, {40, 16},
+    //     {20, 32}, {10, 64}, {5, 128}, {1, 640},
+    // };
+
+    //std::vector<std::pair<u32, u32>> combinations = {
+    //{10, 8}, {10, 16}, {10, 32}, {10, 64}, {10, 128}, {10, 256}, {10, 512}, {10, 1024},
+    //{50, 8}, {50, 16}, {50, 32}, {50, 64}, {50, 128}, {50, 256}, {50, 512}, {50, 5024},
+    //};
+    std::vector<std::pair<u32, u32>> combinations;
+    // growing size
+    //auto sample_size = 64u;
+    //for (auto sample_count : {1,2,5,10,20,50,100,500,1000}) {
+    //    combinations.push_back({sample_count, sample_size});
+    //}
+    //// growing count
+    auto sample_count = 10;
+    for (auto sample_size : {1,2,4,8,16,32,64,128,256,512,1024,2048,4096}) {
+        combinations.push_back({sample_count, sample_size});
+    }
+
+    //for (auto sample_count = 1u; sample_count < (65535/sample_size)/2; sample_count *= 2) {
+    //    for (auto sample_count : counts) {
+    for (auto& [sample_count, sample_size] : combinations) {
+        RandomSampler<T> sampler(sample_size, sample_count);
+        sample_t<T> sample_result;
+        sample_result = sampler(infile.data, blocksize, whole_stats);
+        auto& [sample, sample_nulls] = sample_result;
+        Stats sample_stats = Stats::generateStats(
+            sample.data(), sample_nulls.data(), sample.size());
+        for (auto& [stype, scheme] : types::schemes()) {
+            if (excluded.find(stype) != excluded.end()) {
+                continue;
+            }
+            e.setParam("scheme", scheme->selfDescription());
+            e.setParam("sampling", sampler.name());
+            e.setParam("sample_size", sample_size);
+            e.setParam("sample_count", sample_count);
+            e.setParam("sampled_items", sampler.sampled_items());
+            {
+                PerfEventBlock blk(e, 1);
+                auto outsize = scheme->compress(sample_stats.src, sample_stats.bitmap, output, sample_stats, FLAGS_bm_max_cascade_depth);
+                double estimate = CD(sample_count * sample_size * sizeof(T)) / CD(outsize);
+                e.setParam("compr", estimate);
+            }
+        }
+        //}
+    }
+    return TestResult{};
+}
+
+using test_fn_t = std::function<TestResult(const std::string&, const std::string&, u8*, PerfEvent&)>;
+
+std::unordered_map<std::string, std::pair<test_fn_t, u8>> FILE_TYPE_MAP = {
+{".double"s, {testSampling<DOUBLE>, 7}},
+{"integer"s, {testSampling<INTEGER>, 8}},
+//{".string", testSampling<str>},
+};
+static constexpr u8 TYPE_MAPPING_SUFFIX_LENGTH = 7;
+
+int main(int argc, char *argv[])
+{
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
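+    // Experiment layout: for every input column, testSampling() first
+    // compresses the full 65k block with every registered scheme (the "full"
+    // baseline rows), then re-estimates the compression factor from
+    // sample_count random samples of sample_size values each, so the sampled
+    // estimates can be compared against ground truth per scheme.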
+    spdlog::set_level(spdlog::level::warn);
+    CSchemePool::refresh();
+    PerfEvent perf;
+
+    InputFiles filelist(FLAGS_bm_input_file_list);
+    std::string s3file;
+    size_t bufsize = 2 * MAX_PBI_BYTES / sizeof(size_t);
+    size_t* scratchbuf = new size_t[bufsize];
+
+    for (auto i = 0; i != FLAGS_bm_skip_files;) {
+        filelist.next(s3file);
+        i += (s3file[0] != '#');
+    }
+    int count = 0;
+    while (filelist.next(s3file)) {
+        if (s3file[0] == '#') { continue; }
+        if (count > FLAGS_bm_file_count) { break; } // only some files
+
+        //std::fill(scratchbuf, scratchbuf + bufsize, 0);
+
+        TestResult res;
+
+        auto typestr = s3file.substr(s3file.length() - TYPE_MAPPING_SUFFIX_LENGTH, s3file.length());
+        auto typedata = FILE_TYPE_MAP.find(typestr);
+        if (typedata == FILE_TYPE_MAP.end()) {
+            spdlog::debug("skipping file {} with suffix {}", s3file, typestr);
+            continue;
+        } else {
+            try {
+                ++count;
+                auto& [fn, substr] = typedata->second;
+                auto bitmap_s3 = s3file.substr(0, s3file.length() - substr) + ".bitmap";
+                auto [infile, bitmap_file] = [&] {
+                    auto f1 = std::async(std::launch::async,
+                                         [&] { return ensure_file(s3file); });
+                    auto f2 = std::async(std::launch::async,
+                                         [&] { return ensure_file(bitmap_s3); });
+                    return std::make_pair(f1.get(), f2.get());
+                }();
+                res = fn(infile, bitmap_file, reinterpret_cast<u8*>(scratchbuf), perf);
+                if (FLAGS_bm_clear_filecache || count > 800) {
+                    std::filesystem::remove(infile);
+                }
+            } catch (std::exception& e) {
+                std::cerr << s3file << ":" << e.what() << std::endl;
+            }
+        }
+
+        //std::cout << res << std::endl;
+
+    }
+    delete[] scratchbuf;
+    return 0;
+}
diff --git a/benchmarks/analyze_better_blocks/playground/tbb.cpp b/benchmarks/analyze_better_blocks/playground/tbb.cpp
new file mode 100644
index 0000000..977a328
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/playground/tbb.cpp
@@ -0,0 +1,36 @@
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+#include
+#include
+
+struct mytask {
+    mytask(size_t n)
+        :_n(n)
+    {}
+    void operator()() {
+        for (int i=0;i<1000000;++i) {}  // Deliberately run slow
+        std::cerr << "[" << _n << "]";
+    }
+    size_t _n;
+};
+
+int main(int,char**) {
+
+    //tbb::task_scheduler_init init; // Automatic number of threads
+    tbb::task_scheduler_init init(tbb::task_scheduler_init::default_num_threads());  // Explicit number of threads
+
+    std::vector<mytask> tasks;
+    for (int i=0;i<1000;++i)
+        tasks.push_back(mytask(i));
+
+    // NOTE: the remainder of this example was garbled in the patch text; the
+    // loop body and closing lines below are a reconstruction.
+    tbb::parallel_for(
+        tbb::blocked_range<size_t>(0,tasks.size()),
+        [&tasks](const tbb::blocked_range<size_t>& r) {
+            for (size_t i=r.begin();i<r.end();++i) tasks[i]();
+        }
+    );
+
+    std::cerr << std::endl;
+
+    return 0;
+}
diff --git a/benchmarks/analyze_better_blocks/playground/test-s3-crt.cpp b/benchmarks/analyze_better_blocks/playground/test-s3-crt.cpp
new file mode 100644
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/playground/test-s3-crt.cpp
@@ -0,0 +1,263 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Uncomment for S3 Crt */
+#define USES3CRT
+
+#ifdef USES3CRT
+#include
+#include
+#include
+
+namespace s3 = Aws::S3Crt;
+using s3_client_t = s3::S3CrtClient;
+#else
+#include
+#include
+namespace s3 = Aws::S3;
+using s3_client_t = s3::S3Client;
+#endif
+
+
+// NOTE: template arguments in this file were lost in the patch text; the
+// container and handler types below are inferred from usage and from the AWS
+// SDK GetObjectAsync handler signature.
+std::mutex mutex;
+std::condition_variable condition_variable;
+long remaining_results;
+std::vector<std::vector<s3::Model::GetObjectRequest>> get_requests;
+std::function<void(const s3_client_t*, const s3::Model::GetObjectRequest&, s3::Model::GetObjectOutcome, const std::shared_ptr<const Aws::Client::AsyncCallerContext>&)> callback;
+size_t next_index = 0;
+
+long num_preallocated_buffers;
+long part_size;
+std::vector<std::vector<unsigned char>> streambufarrays;
+std::vector<Aws::Utils::Stream::PreallocatedStreamBuf*> streambufs;
+std::mutex buffer_mutex;
+std::condition_variable buffer_cv;
+std::unordered_set<long> buffers_available;
+std::unordered_map<Aws::IOStream*, long> occupied_map;
+static const char *allocation_tag = "test-s3-crt";
+static std::atomic<long long> total_downloaded_size = 0;
+
+static void free_buffers() {
+    for (auto ptr : streambufs) {
+        Aws::Delete(ptr);
+    }
+}
+
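+// Buffer recycling: response bodies land in preallocated, pre-faulted
+// buffers rather than SDK-allocated ones. The SDK deletes the Aws::IOStream
+// it receives from the factory, so getStream()/releaseStream() only recycle
+// the underlying streambuf slots (guarded by buffer_mutex); releaseStream()
+// re-wraps the slot in a fresh PreallocatedStreamBuf before reuse.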
+static void prepare_buffers() {
+    for (long i = 0; i < num_preallocated_buffers; i++) {
+        streambufarrays.emplace_back(part_size);
+        /* Touch every page to avoid faults later */
+        for (long pos = 0; pos < part_size; pos++) {
+            streambufarrays[i][pos] = 0;
+        }
+        streambufs.emplace_back(Aws::New<Aws::Utils::Stream::PreallocatedStreamBuf>(allocation_tag, streambufarrays[i].data(), static_cast<uint64_t>(part_size)));
+        // TODO we never call delete, but for now that's also not important
+        buffers_available.insert(i);
+    }
+}
+
+static Aws::IOStream *getStream() {
+    Aws::IOStream *result;
+    {
+        std::unique_lock<std::mutex> lock(buffer_mutex);
+        buffer_cv.wait(lock, [&] {
+            return !buffers_available.empty();
+        });
+
+        long idx = *buffers_available.begin();
+        buffers_available.erase(buffers_available.begin());
+        result = Aws::New<Aws::IOStream>(allocation_tag, streambufs[idx]);
+        // The SDK will call delete on the stream
+        occupied_map[result] = idx;
+    }
+    return result;
+}
+Aws::IOStreamFactory response_stream_factory;
+
+static void releaseStream(Aws::IOStream *stream) {
+    {
+        std::lock_guard<std::mutex> guard(buffer_mutex);
+        long idx = occupied_map[stream];
+        occupied_map.erase(stream);
+        streambufs[idx] = Aws::New<Aws::Utils::Stream::PreallocatedStreamBuf>(allocation_tag, streambufarrays[idx].data(), static_cast<uint64_t>(streambufarrays[idx].size()));
+        buffers_available.insert(idx);
+    }
+
+    buffer_cv.notify_one();
+}
+
+static void GetObjectResponseReceiveHandler(
+    const s3_client_t*,
+    const s3::Model::GetObjectRequest& get_request,
+    s3::Model::GetObjectOutcome outcome,
+    const std::shared_ptr<const Aws::Client::AsyncCallerContext>&) {
+    /* Called once the request finishes */
+    if (!outcome.IsSuccess()) {
+        throw std::runtime_error(outcome.GetError().GetMessage());
+    }
+
+    total_downloaded_size += outcome.GetResult().GetContentLength();
+    //std::cerr << get_request.GetKey() << std::endl;
+    long remaining;
+    {
+        std::lock_guard<std::mutex> guard(mutex);
+        remaining = --remaining_results;
+    }
+
+    if (remaining == 0) {
+        condition_variable.notify_one();
+    }
+
+    releaseStream(&(outcome.GetResult().GetBody()));
+}
+
+static void GetObjects(const s3_client_t& s3_client, const Aws::String& fromBucket, long object_count, long object_size)
+{
+    for (long current_object = 0; current_object < object_count; current_object++) {
+        std::vector<s3::Model::GetObjectRequest> &current_request_vector = get_requests[next_index++];
+
+        long num_requests = (object_size + part_size - 1) / part_size;
+        current_request_vector.resize(num_requests);
+
+        // Make request in chunks of part_size. Otherwise, we would try to store the whole file in memory.
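+        // Worked example (illustrative): a 1 GiB object with part_size = 16 MiB
+        // becomes ceil(1024 / 16) = 64 ranged GETs; ranges are inclusive, so
+        // each request asks for bytes [current_offset, current_offset + part_size - 1].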
+        long current_offset = 0;
+        for (long request_idx = 0; request_idx < num_requests; request_idx++) {
+            {
+                std::lock_guard<std::mutex> guard(mutex);
+                remaining_results++;
+            }
+            //s3::Model::GetObjectRequest &current_request = current_request_vector[request_idx];
+            static s3::Model::GetObjectRequest current_request;
+            current_request.SetBucket(fromBucket);
+            std::stringstream key;
+            key << object_size << "/" << current_object;
+            current_request.SetKey(key.str());
+
+            // The range is inclusive
+            std::stringstream range;
+            long last_byte = std::min(current_offset + part_size-1, object_size-1);
+            range << "bytes=" << current_offset << "-" << last_byte;
+            current_request.SetRange(range.str());
+
+            current_request.SetResponseStreamFactory(response_stream_factory);
+
+            s3_client.GetObjectAsync(current_request, callback);
+            current_offset = last_byte + 1;
+        }
+
+        if (current_offset != object_size) {
+            throw std::logic_error("Invalid offset value after last request");
+        }
+    }
+}
+
+static void usage(const char *program) {
+    std::cerr << "Usage: " << program << " region bucket object_count object_size part_size num_pre_allocate repetitions target_throughput" << std::endl;
+    exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+    if (argc != 9) {
+        usage(argv[0]);
+    }
+
+    Aws::SDKOptions options;
+    Aws::InitAPI(options);
+    {
+        const Aws::String bucket_name = argv[2];
+
+        long object_count = std::stol(argv[3]);
+        if (object_count < 1) {
+            usage(argv[0]);
+        }
+        long object_size = std::stol(argv[4]);
+        if (object_size < 1) {
+            usage(argv[0]);
+        }
+
+        part_size = std::stol(argv[5]);
+        if (part_size < 1) {
+            usage(argv[0]);
+        }
+
+        num_preallocated_buffers = std::stol(argv[6]);
+        if (num_preallocated_buffers < 1) {
+            usage(argv[0]);
+        }
+
+        long repetitions = std::stol(argv[7]);
+        if (repetitions < 1) {
+            usage(argv[0]);
+        }
+
+        double target_throughput = std::stod(argv[8]);
+        if (target_throughput <= 0.0) {
+            usage(argv[0]);
+        }
+
+        get_requests.resize(object_count * repetitions);
+        next_index = 0;
+        callback = GetObjectResponseReceiveHandler;
+
+        const Aws::String region = argv[1];
+
+#ifdef USES3CRT
+        s3::ClientConfiguration config;
+        //config.partSize = 16 * 1024 * 1024;
+        config.partSize = part_size;
+        config.throughputTargetGbps = target_throughput;
+        config.region = region;
+        // TODO maybe experiment with this one
+        //config.useDualStack = true;
+        config.scheme = Aws::Http::Scheme::HTTP;
+        s3_client_t s3_client(config);
+#else
+        s3_client_t s3_client;
+#endif
+
+        prepare_buffers();
+        response_stream_factory = getStream;
+
+        auto t1 = std::chrono::high_resolution_clock::now();
+        // Start all requests asynchronously
+        for (long rep = 0; rep < repetitions; rep++) {
+            GetObjects(s3_client, bucket_name, object_count, object_size);
+        }
+        // Wait until all requests are actually finished
+        {
+            std::unique_lock<std::mutex> lock(mutex);
+            condition_variable.wait(lock, []{return remaining_results == 0;});
+        }
+        auto t2 = std::chrono::high_resolution_clock::now();
+
+        auto us = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+        double s = static_cast<double>(us.count()) / static_cast<double>(1e6);
+        std::size_t total_size_bytes = static_cast<std::size_t>(object_count) * static_cast<std::size_t>(object_size);
+        double total_size_gigabits = static_cast<double>(total_size_bytes * 8) / static_cast<double>(1<<30);
+        double gbps = (total_size_gigabits / s) * static_cast<double>(repetitions);
+        std::cout << "total_size_bytes = " << total_size_bytes << " total_downloaded_size = " << total_downloaded_size << std::endl;
+        std::cout << "Speed: " << gbps << " Gbps" << std::endl;
+
+        free_buffers();
+    }
+    
Aws::ShutdownAPI(options); +} diff --git a/benchmarks/analyze_better_blocks/playground/test-s3-custom-stream.cpp b/benchmarks/analyze_better_blocks/playground/test-s3-custom-stream.cpp new file mode 100644 index 0000000..d2dd4f5 --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/test-s3-custom-stream.cpp @@ -0,0 +1,99 @@ +// +// Created by david on 24.04.22. +// + +// +// Created by david on 21.04.22. +// + +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALLOCATION_TAG "TestTag" + +bool GetObject(const Aws::String& objectKey, + const Aws::String& fromBucket, const Aws::String& region) +{ + Aws::Client::ClientConfiguration config; + + if (!region.empty()) + { + config.region = region; + } + + Aws::S3::S3Client s3_client(config); + + Aws::S3::Model::GetObjectRequest object_request; + object_request.SetBucket(fromBucket); + object_request.SetKey(objectKey); + + size_t buffer_size = 100; + Aws::Utils::Array buffer(buffer_size); + auto stream = Aws::New("TestTag", Aws::New("TestTag", buffer.GetUnderlyingData(), buffer_size)); + object_request.SetResponseStreamFactory([&](){ + //return Aws::New>(std::ios_base::out); + return stream; + }); + + Aws::S3::Model::GetObjectOutcome get_object_outcome = + s3_client.GetObject(object_request); + + if (get_object_outcome.IsSuccess()) + { + auto& retrieved_file = get_object_outcome.GetResultWithOwnership(). + GetBody(); + + // Print a beginning portion of the text file. + std::cout << "Beginning of file contents:\n"; + char file_data[255] = { 0 }; + retrieved_file.getline(file_data, 254); + std::cout << file_data << std::endl; + + return true; + } + else + { + auto err = get_object_outcome.GetError(); + std::cout << "Error: GetObject: " << + err.GetExceptionName() << ": " << err.GetMessage() << std::endl; + + return false; + } +} + +int main(int argc, char **argv) { + Aws::SDKOptions options; + Aws::InitAPI(options); + { + const Aws::String bucket_name = "bucketprefix-public-bi-benchmark-csv"; + + //const Aws::String object_name = "aws-cpp-sdk-test.txt"; + const Aws::String object_name = "generico1/Generico_1.csv"; + + const Aws::String region = "us-east-1"; + + auto t1 = std::chrono::high_resolution_clock::now(); + bool result = GetObject(object_name, bucket_name, region); + auto t2 = std::chrono::high_resolution_clock::now(); + + auto ms_int = std::chrono::duration_cast(t2 - t1); + std::cout << "Execution took " << ms_int.count() << std::endl; + + if (!result) { + return 1; + } + } + Aws::ShutdownAPI(options); +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/playground/test-s3-transfer.cpp b/benchmarks/analyze_better_blocks/playground/test-s3-transfer.cpp new file mode 100644 index 0000000..8c075c9 --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/test-s3-transfer.cpp @@ -0,0 +1,93 @@ +// +// Created by david on 24.04.22. 
+// + +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const size_t BUFFER_SIZE = 512 * 1024 * 1024; // 512MB Buffer + +bool GetObject(const Aws::String& objectKey, + const Aws::String& fromBucket, const Aws::String& region) +{ + auto s3_client = Aws::MakeShared("S3Client"); + auto executor = Aws::MakeShared("executor", 25); + Aws::Transfer::TransferManagerConfiguration transfer_config(executor.get()); + transfer_config.s3Client = s3_client; + + auto transfer_manager = Aws::Transfer::TransferManager::Create(transfer_config); + + Aws::Utils::Array buffer(BUFFER_SIZE); + auto stream = Aws::New("TestTag", Aws::New("TestTag", buffer.GetUnderlyingData(), BUFFER_SIZE)); + auto downloadHandle = transfer_manager->DownloadFile(fromBucket, + objectKey, + [&]() { //Define a lambda expression for the callback method parameter to stream back the data. + return stream; + }); + + // Try to read partial data from stream + for (int i = 0; i < 100; i++) { + std::cout << "Round " << i << std::endl; + std::cout << downloadHandle->GetStatus() << std::endl; + + auto available_bytes = stream->tellp(); + std::cout << available_bytes << std::endl; + Aws::Utils::Array read_buffer(available_bytes); + stream->read(read_buffer.GetUnderlyingData(), available_bytes); + assert(stream->gcount() == available_bytes); + + std::string result(read_buffer.GetUnderlyingData(), read_buffer.GetUnderlyingData() + stream->gcount()); + std::cout << result << std::endl; + + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + std::cout << "Waiting until download finishes." << std::endl; + downloadHandle->WaitUntilFinished();// Block calling thread until download is complete. + auto downStat = downloadHandle->GetStatus(); + if (downStat != Aws::Transfer::TransferStatus::COMPLETED) + { + auto err = downloadHandle->GetLastError(); + std::cout << "File download failed: " << err.GetMessage() << std::endl; + return false; + } + //std::cout << "File download to memory finished." << std::endl; + return true; +} + +int main(int argc, char **argv) { + Aws::SDKOptions options; + Aws::InitAPI(options); + { + const Aws::String bucket_name = "bucketprefix-public-bi-benchmark-csv"; + + //const Aws::String object_name = "aws-cpp-sdk-test.txt"; + const Aws::String object_name = "generico1/Generico_1.csv"; + + const Aws::String region = "us-east-1"; + + auto t1 = std::chrono::high_resolution_clock::now(); + bool result = GetObject(object_name, bucket_name, region); + auto t2 = std::chrono::high_resolution_clock::now(); + + auto ms_int = std::chrono::duration_cast(t2 - t1); + std::cout << "Execution took " << ms_int.count() << std::endl; + + if (!result) { + return 1; + } + } + Aws::ShutdownAPI(options); +} diff --git a/benchmarks/analyze_better_blocks/playground/test-s3.cpp b/benchmarks/analyze_better_blocks/playground/test-s3.cpp new file mode 100644 index 0000000..f48fe29 --- /dev/null +++ b/benchmarks/analyze_better_blocks/playground/test-s3.cpp @@ -0,0 +1,78 @@ +// +// Created by david on 21.04.22. 
+// + +#include +#include + + +#include +#include +#include + +bool GetObject(const Aws::String& objectKey, + const Aws::String& fromBucket, const Aws::String& region) +{ + Aws::Client::ClientConfiguration config; + + if (!region.empty()) + { + config.region = region; + } + + Aws::S3::S3Client s3_client(config); + + Aws::S3::Model::GetObjectRequest object_request; + object_request.SetBucket(fromBucket); + object_request.SetKey(objectKey); + + Aws::S3::Model::GetObjectOutcome get_object_outcome = + s3_client.GetObject(object_request); + + if (get_object_outcome.IsSuccess()) + { + auto& retrieved_file = get_object_outcome.GetResultWithOwnership(). + GetBody(); + + // Print a beginning portion of the text file. + std::cout << "Beginning of file contents:\n"; + char file_data[255] = { 0 }; + retrieved_file.getline(file_data, 254); + std::cout << file_data << std::endl; + + return true; + } + else + { + auto err = get_object_outcome.GetError(); + std::cout << "Error: GetObject: " << + err.GetExceptionName() << ": " << err.GetMessage() << std::endl; + + return false; + } +} + +int main(int argc, char **argv) { + Aws::SDKOptions options; + Aws::InitAPI(options); + { + const Aws::String bucket_name = "bucketprefix-public-bi-benchmark-csv"; + + //const Aws::String object_name = "aws-cpp-sdk-test.txt"; + const Aws::String object_name = "generico1/Generico_1.csv"; + + const Aws::String region = "us-east-1"; + + auto t1 = std::chrono::high_resolution_clock::now(); + bool result = GetObject(object_name, bucket_name, region); + auto t2 = std::chrono::high_resolution_clock::now(); + + auto ms_int = std::chrono::duration_cast(t2 - t1); + std::cout << "Execution took " << ms_int.count() << std::endl; + + if (!result) { + return 1; + } + } + Aws::ShutdownAPI(options); +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/shared-headers/Exceptions.hpp b/benchmarks/analyze_better_blocks/shared-headers/Exceptions.hpp new file mode 100644 index 0000000..15d21c5 --- /dev/null +++ b/benchmarks/analyze_better_blocks/shared-headers/Exceptions.hpp @@ -0,0 +1,23 @@ +#pragma once +#include +#include +#include +// ------------------------------------------------------------------------------------- +#define UNREACHABLE() assert(false); // TODO +#define TODO() assert(false); // TODO +// ------------------------------------------------------------------------------------- +#define die_if(expr) if (!(expr)) { perror(#expr); assert(false); } +//-------------------------------------------------------------------------------------- +#define GenericException(name) \ +struct name : public std::exception { \ + const std::string msg; \ + explicit name() \ + : msg(#name) { printf("Throwing exception: %s\n", #name); } \ + explicit name(const std::string& msg) \ + : msg(msg) { printf("Throwing exception: %s(%s)\n", #name, msg.c_str()); } \ + ~name() = default; \ + virtual const char *what() const noexcept { return msg.c_str(); } \ +}; \ +//-------------------------------------------------------------------------------------- +GenericException(Generic_Exception); +// ------------------------------------------------------------------------------------- \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/shared-headers/PerfEvent.hpp b/benchmarks/analyze_better_blocks/shared-headers/PerfEvent.hpp new file mode 100644 index 0000000..bdff486 --- /dev/null +++ b/benchmarks/analyze_better_blocks/shared-headers/PerfEvent.hpp @@ -0,0 +1,247 @@ +/* + +Copyright (c) 2018 Viktor Leis + +Permission is 
hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + */ + +#pragma once + +#if defined(__linux__) + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct PerfEvent { + + struct event { + struct read_format { + uint64_t value; + uint64_t time_enabled; + uint64_t time_running; + uint64_t id; + }; + + perf_event_attr pe; + int fd; + read_format prev; + read_format data; + + double readCounter() { + double multiplexingCorrection = static_cast(data.time_enabled - prev.time_enabled) / (data.time_running - prev.time_running); + return (data.value - prev.value) * multiplexingCorrection; + } + }; + + std::vector events; + std::vector names; + std::chrono::time_point startTime; + std::chrono::time_point stopTime; + std::map params; + bool printHeader; + + PerfEvent() : printHeader(true) { + registerCounter("cycle", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); + registerCounter("instr", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); + registerCounter("L1-miss", PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_L1D|(PERF_COUNT_HW_CACHE_OP_READ<<8)|(PERF_COUNT_HW_CACHE_RESULT_MISS<<16)); + registerCounter("LLC-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); + registerCounter("br-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); + registerCounter("task", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK); + // additional counters can be found in linux/perf_event.h + + for (unsigned i=0; i(stopTime - startTime).count(); + } + + double getIPC() { + return getCounter("instr") / getCounter("cycle"); + } + + double getCPUs() { + return getCounter("task") / (getDuration() * 1e9); + } + + double getGHz() { + return getCounter("cycle") / getCounter("task"); + } + + double getCounter(const std::string& name) { + for (unsigned i=0; i + static void printCounter(std::ostream& headerOut, std::ostream& dataOut, std::string name, T counterValue,bool addComma=true) { + std::stringstream stream; + stream << std::fixed << std::setprecision(2) << counterValue; + PerfEvent::printCounter(headerOut,dataOut,name,stream.str(),addComma); + } + + void printReport(std::ostream& out, uint64_t normalizationConstant) { + std::stringstream header; + std::stringstream data; + printReport(header,data,normalizationConstant); + out << header.str() << std::endl; + out << data.str() << std::endl; + } + + void printReport(std::ostream& headerOut, std::ostream& dataOut, uint64_t normalizationConstant) { + if (!events.size()) + return; + + // print all metrics + for (unsigned i=0; i + void setParam(const 
std::string& name,T value) { + setParam(name,std::to_string(value)); + } + + void printParams(std::ostream& header,std::ostream& data) { + for (auto& p : params) { + printCounter(header,data,p.first,p.second); + } + } +}; + +struct PerfEventBlock { + PerfEvent& e; + uint64_t scale; + + PerfEventBlock(PerfEvent& e, uint64_t scale = 1) : e(e), scale(scale) { + e.startCounters(); + } + + ~PerfEventBlock() { + e.stopCounters(); + std::stringstream header; + std::stringstream data; + e.printParams(header,data); + PerfEvent::printCounter(header,data,"time",e.getDuration()); + e.printReport(header, data, scale); + if (e.printHeader) { + std::cout << header.str() << std::endl; + e.printHeader = false; + } + std::cout << data.str() << std::endl; + } +}; + +#else +#include +struct PerfEvent { + void startCounters() {} + void stopCounters() {} + void printReport(std::ostream&, uint64_t) {} +}; +#endif diff --git a/benchmarks/analyze_better_blocks/shared-headers/PerfExternal.hpp b/benchmarks/analyze_better_blocks/shared-headers/PerfExternal.hpp new file mode 100644 index 0000000..4bc6c1a --- /dev/null +++ b/benchmarks/analyze_better_blocks/shared-headers/PerfExternal.hpp @@ -0,0 +1,45 @@ +#pragma once +#include +#include + +/** + * Use an external perf process to profile part of the program. + * Use together with the 'perf-partial' shell script. + * Can only be used once in a program together with the shell script. + * */ +struct PerfExternal { + static void start(bool print = false) { + std::string input; + std::cout << "perf_point_1" << std::endl; + std::getline(std::cin, input); + if (print) { std::cout << "perf point 1 done, read" << input << "/" << input.size() << std::endl; } + } + + static void stop(bool print = false) { + std::string input; + std::cout << "perf_point_2" << std::endl; + std::getline(std::cin, input); + if (print) { + std::cout << "perf point 2 done, read " << input << "/" << input.size() << std::endl; + } + } +}; + +struct PerfExternalBlock { + bool activate=true; + + PerfExternalBlock(bool activate) : activate(activate) { start(); } + PerfExternalBlock() { + char* e = getenv("PERF"); + activate = e != nullptr && ( + !strcmp(e, "true") || !strcmp(e, "t") || !strcmp(e, "y") || !strcmp(e, "yes") || !strcmp(e, "1") + ); + start(); + } + + ~PerfExternalBlock() { stop(); } + + private: + void start() {if (activate) { PerfExternal::start(false); }} + void stop() {if (activate) { PerfExternal::stop(false); }} +}; diff --git a/benchmarks/analyze_better_blocks/shared-headers/Reinterpret.hpp b/benchmarks/analyze_better_blocks/shared-headers/Reinterpret.hpp new file mode 100644 index 0000000..d9d605a --- /dev/null +++ b/benchmarks/analyze_better_blocks/shared-headers/Reinterpret.hpp @@ -0,0 +1,7 @@ +#pragma once +#include "Units.hpp" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +#define RD(num) *reinterpret_cast(&num) +#define RU64(num) *reinterpret_cast(&num) +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/shared-headers/SIMD.hpp b/benchmarks/analyze_better_blocks/shared-headers/SIMD.hpp new file mode 100644 index 0000000..090a51e --- /dev/null +++ b/benchmarks/analyze_better_blocks/shared-headers/SIMD.hpp @@ -0,0 +1,28 @@ +#pragma once + +// ---------------------------------- Not Using SIMD ---------------------------------- // +#if defined(BTR_FLAG_NO_SIMD) and 
BTR_FLAG_NO_SIMD +// ------------------------------------------------------------------------------------ // + +#undef BTR_USE_SIMD +#define BTR_IFSIMD(x...) +#define BTR_IFELSESIMD(a, b) b +#define SIMD_EXTRA_BYTES 0 +#define SIMD_EXTRA_ELEMENTS(TYPE) 0 + +// ------------------------------------ Using SIMD ------------------------------------ // +#else // USE_SIMD +// ------------------------------------------------------------------------------------ // + +#include + +#define BTR_IFSIMD(x...) x +#define BTR_IFELSESIMD(a, b) a +#define BTR_USE_SIMD 1 + +// SIMD instruction can become faster when they are allowed to make writes out of bounds. This spares us any out of +// bound checks and therefore many branches. The extra data simply gets overwritten or ignored. +#define SIMD_EXTRA_BYTES (sizeof(__m256i) * 4) +#define SIMD_EXTRA_ELEMENTS(TYPE) (SIMD_EXTRA_BYTES / sizeof(TYPE)) + +#endif // BTR_FLAG_NO_SIMD diff --git a/benchmarks/analyze_better_blocks/shared-headers/Units.hpp b/benchmarks/analyze_better_blocks/shared-headers/Units.hpp new file mode 100644 index 0000000..6168f33 --- /dev/null +++ b/benchmarks/analyze_better_blocks/shared-headers/Units.hpp @@ -0,0 +1,125 @@ +#pragma once +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include +#include "Exceptions.hpp" +#include "SIMD.hpp" +// ------------------------------------------------------------------------------------- +#define NULL_CODE_MARGIN 1 +#define FSST_THRESHOLD (16 * 1024) +// ------------------------------------------------------------------------------------- +using std::cerr; +using std::cout; +using std::vector; +using std::string; +using std::endl; +using std::unique_ptr; +using std::make_unique; +using std::tuple; +// ------------------------------------------------------------------------------------- +using u8 = uint8_t; +using u16 = uint16_t; +using u32 = uint32_t; +using u64 = uint64_t; +// ------------------------------------------------------------------------------------- +using s8 = int8_t; +using s16 = int16_t; +using s32 = int32_t; +using s64 = int64_t; +// ------------------------------------------------------------------------------------- +using SIZE = size_t; +// ------------------------------------------------------------------------------------- +enum class BitmapType : u8 { + ALLONES, + ALLZEROS, + REGULAR, + FLIPPED +}; +// ------------------------------------------------------------------------------------- +enum class ColumnType : u8 { + INTEGER, + DOUBLE, + STRING, + SKIP, // SKIP THIS COLUMN + // The next types are out of scope + FLOAT, + BIGINT, + SMALLINT, + UNDEFINED +}; +using TINYINT = s8; +using SMALLINT = s16; +using INTEGER = s32; // we use FOR always at the beginning so negative integers will be handled out +using UINTEGER = u32; +using DOUBLE = double; +using STRING = string; +using BITMAP = u8; +// ------------------------------------------------------------------------------------- +using str = std::string_view; +// ------------------------------------------------------------------------------------- +inline ColumnType ConvertStringToType(const string type_str) { + if (type_str == "integer") return ColumnType::INTEGER; + else if (type_str == "double") return ColumnType::DOUBLE; + else if (type_str == "string") return ColumnType::STRING; + else if (type_str == "skip") return ColumnType::SKIP; + else return ColumnType::SKIP; +}; +// 
-------------------------------------------------------------------------------------
+inline string ConvertTypeToString(const ColumnType type_str) {
+    if (type_str == ColumnType::INTEGER) return "integer";
+    else if (type_str == ColumnType::DOUBLE) return "double";
+    else if (type_str == ColumnType::STRING) return "string";
+    else UNREACHABLE();
+    return "";
+};
+// -------------------------------------------------------------------------------------
+using BytesArray = std::unique_ptr<u8[]>;
+// Does not use make_unique because that zeros out the memory which is expensive.
+#define makeBytesArray(size) std::unique_ptr<u8[]>(new u8[size])
+
+// -------------------------------------------------------------------------------------
+#define TEST_DATASET(file) "test-dataset/" file ""
+// -------------------------------------------------------------------------------------
+template <typename T>
+inline constexpr void writeRaw(u8 *base, u32 offset, T value) {
+    *reinterpret_cast<T*>(base + offset) = value;
+}
+// -------------------------------------------------------------------------------------
+template <typename T>
+inline constexpr T readRaw(const u8 *base, u32 offset) {
+    return *reinterpret_cast<const T*>(base + offset);
+}
+// -------------------------------------------------------------------------------------
+template <typename T>
+inline constexpr T readRaw(u8 *base, u32 offset) { // TODO: Deprecate
+    return *reinterpret_cast<T*>(base + offset);
+}
+// -------------------------------------------------------------------------------------
+template <typename T>
+inline T *get_level_data(std::vector<std::vector<T>> &v, std::size_t s, std::size_t level) {
+    v.resize(std::max(level+1, v.size()));
+    v[level].resize(std::max(s, v[level].size()));
+    return v[level].data();
+}
+
+template <typename T>
+inline T *get_data(std::vector<T> &v, std::size_t s) {
+    v.resize(std::max(s, v.size()));
+    return v.data();
+}
+#define WRITE_RAW(base, offset, type, value) *reinterpret_cast<type*>(base + offset) = value
+// -------------------------------------------------------------------------------------
+#define INT_BUFFER_SIZE 65000*4*2
+// -------------------------------------------------------------------------------------
+// NOTE: the cast target types below were lost in the patch text and are
+// inferred from the macro names (byte / double / u32 / s32).
+#define CB(enum) static_cast<u8>(enum)
+#define CD(num) static_cast<double>(num)
+#define CU(num) static_cast<u32>(num)
+#define CI(num) static_cast<s32>(num)
+// -------------------------------------------------------------------------------------
+#define AUTO_SCHEME 255
+#define NULL_CODE 0
diff --git a/benchmarks/analyze_better_blocks/shared-headers/local.cmake b/benchmarks/analyze_better_blocks/shared-headers/local.cmake
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/test/CMakeLists.txt b/benchmarks/analyze_better_blocks/test/CMakeLists.txt
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/test/DatasetGenerator.cpp b/benchmarks/analyze_better_blocks/test/DatasetGenerator.cpp
new file mode 100644
index 0000000..f975c98
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/test/DatasetGenerator.cpp
@@ -0,0 +1,366 @@
+#include "MMapvector.hpp"
+#include "Units.hpp"
+// -------------------------------------------------------------------------------------
+#include "gflags/gflags.h"
+// -------------------------------------------------------------------------------------
+#include
+#include
+#include
+#include
+#include
+// -------------------------------------------------------------------------------------
+using namespace std;
+// -------------------------------------------------------------------------------------
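+// Each generated column is written as a pair of files under
+// test-dataset/<type>/: "<SCHEME>.<type>" with the values and
+// "<SCHEME>.bitmap" with one u8 per tuple (1 = value present). Each column's
+// shape is chosen to trigger exactly one compression scheme (ONE_VALUE,
+// TRUNCATE_8/16, DICTIONARY_8/16, RLE, FREQUENCY, ...); bitmaps are all ones
+// except where nulls are injected explicitly.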
+DEFINE_uint64(tuple_count, 6500 * 10, ""); +DEFINE_uint32(dict_distinct_val_threshold, 10, ""); +//-------------------------------------------------------------------------------------- +void GenerateRandomString(char *dest, SIZE len, u8 entropy = 26 * 2 + 10); +// ------------------------------------------------------------------------------------- +const string out_dir_name = "test-dataset"; +// ------------------------------------------------------------------------------------- +int main(int argc, char **argv) +{ + gflags::SetUsageMessage("CSV Dataset parser"); + gflags::ParseCommandLineFlags(&argc, &argv, true); + if ( mkdir(TEST_DATASET(), S_IRWXU | S_IRWXG) && errno != EEXIST ) { + cerr << "creating output directory failed, status = " << errno << endl; + } + // ------------------------------------------------------------------------------------- + // TODO: at the moment, null values are not supported --> bitmaps are ones + vector bitmap = vector(FLAGS_tuple_count, 1); + // ------------------------------------------------------------------------------------- + // Randomness generators + std::srand(std::time(nullptr)); + // ------------------------------------------------------------------------------------- + // INTEGER + { + vector integers; + if ( mkdir(TEST_DATASET("integer"), S_IRWXU | S_IRWXG) && errno != EEXIST ) { + cerr << "creating output directory failed, status = " << errno << endl; + } + // One Value + { + integers = vector(FLAGS_tuple_count, 100); + writeBinary(TEST_DATASET("integer/ONE_VALUE.integer"), integers); + writeBinary(TEST_DATASET("integer/ONE_VALUE.bitmap"), bitmap); + integers.clear(); + } + // Truncate 8 + { + integers.push_back(std::numeric_limits::max()); + for ( u64 i = 1; i < FLAGS_tuple_count; i++ ) { + integers.push_back(std::numeric_limits::max() + (std::rand() % std::numeric_limits::max())); + } + writeBinary(TEST_DATASET("integer/TRUNCATE_8.integer"), integers); + writeBinary(TEST_DATASET("integer/TRUNCATE_8.bitmap"), bitmap); + integers.clear(); + } + // Truncate 16 + { + constexpr auto min = std::numeric_limits::max() / 2; + constexpr auto range = std::numeric_limits::max(); + integers.push_back(min); + for ( u64 i = 1; i < FLAGS_tuple_count; i++ ) { + integers.push_back(min + (std::rand() % range)); + } + writeBinary(TEST_DATASET("integer/TRUNCATE_16.integer"), integers); + writeBinary(TEST_DATASET("integer/TRUNCATE_16.bitmap"), bitmap); + integers.clear(); + } + // Dictionary 8 + { + vector distinct_values; + const u32 distinct_values_count = std::numeric_limits::max(); + { + // prepare the distinct values + set distinct_values_set; + while ( distinct_values_set.size() < distinct_values_count ) { + distinct_values_set.insert(std::rand() % std::numeric_limits::max()); + } + distinct_values = vector(distinct_values_set.begin(), distinct_values_set.end()); + } + u32 distinct_values_index = 0; + for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) { + integers.push_back(distinct_values[distinct_values_index]); + distinct_values_index++; + distinct_values_index = distinct_values_index % distinct_values_count; + } + writeBinary(TEST_DATASET("integer/DICTIONARY_8.integer"), integers); + writeBinary(TEST_DATASET("integer/DICTIONARY_8.bitmap"), bitmap); + integers.clear(); + } + // Dictionary 16 + { + vector distinct_values; + const u32 distinct_values_count = std::numeric_limits::max() * 4; + { + // prepare the distinct values + set distinct_values_set; + while ( distinct_values_set.size() < distinct_values_count ) { + distinct_values_set.insert(std::rand() % 
+      // Dictionary 16
+      {
+         vector<INTEGER> distinct_values;
+         const u32 distinct_values_count = std::numeric_limits<u8>::max() * 4;
+         {
+            // prepare the distinct values
+            set<INTEGER> distinct_values_set;
+            while ( distinct_values_set.size() < distinct_values_count ) {
+               distinct_values_set.insert(std::rand() % std::numeric_limits<INTEGER>::max());
+            }
+            distinct_values = vector<INTEGER>(distinct_values_set.begin(), distinct_values_set.end());
+         }
+         u32 distinct_values_index = 0;
+         for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) {
+            integers.push_back(distinct_values[distinct_values_index]);
+            distinct_values_index++;
+            distinct_values_index = distinct_values_index % distinct_values_count;
+         }
+         writeBinary(TEST_DATASET("integer/DICTIONARY_16.integer"), integers);
+         writeBinary(TEST_DATASET("integer/DICTIONARY_16.bitmap"), bitmap);
+         integers.clear();
+      }
+   }
+   // -------------------------------------------------------------------------------------
+   {
+      // DOUBLE
+      vector<DOUBLE> doubles;
+      std::random_device rd;
+      std::mt19937 gen(rd());
+      std::uniform_real_distribution<> dis(-999999, 999999); // pick better range
+      if ( mkdir(TEST_DATASET("double"), S_IRWXU | S_IRWXG) && errno != EEXIST ) {
+         cerr << "creating output directory failed, status = " << errno << endl;
+      }
+      // One Value
+      {
+         doubles = vector<DOUBLE>(FLAGS_tuple_count, 100.0);
+         writeBinary(TEST_DATASET("double/ONE_VALUE.double"), doubles);
+         writeBinary(TEST_DATASET("double/ONE_VALUE.bitmap"), bitmap);
+         doubles.clear();
+      }
+      // Dictionary 8
+      {
+         vector<DOUBLE> distinct_values;
+         const u32 distinct_values_count = std::numeric_limits<u8>::max();
+         {
+            // prepare the distinct values
+            set<DOUBLE> distinct_values_set;
+            while ( distinct_values_set.size() < distinct_values_count ) {
+               distinct_values_set.insert(dis(gen));
+            }
+            distinct_values = vector<DOUBLE>(distinct_values_set.begin(), distinct_values_set.end());
+         }
+         u32 distinct_values_index = 0;
+         for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) {
+            doubles.push_back(distinct_values[distinct_values_index]);
+            distinct_values_index++;
+            distinct_values_index = distinct_values_index % distinct_values_count;
+         }
+         writeBinary(TEST_DATASET("double/DICTIONARY_8.double"), doubles);
+         writeBinary(TEST_DATASET("double/DICTIONARY_8.bitmap"), bitmap);
+         doubles.clear();
+      }
+      // Dictionary 16
+      {
+         vector<DOUBLE> distinct_values;
+         const u32 distinct_values_count = std::numeric_limits<u8>::max() * 4;
+         {
+            // prepare the distinct values
+            set<DOUBLE> distinct_values_set;
+            while ( distinct_values_set.size() < distinct_values_count ) {
+               distinct_values_set.insert(dis(gen));
+            }
+            distinct_values = vector<DOUBLE>(distinct_values_set.begin(), distinct_values_set.end());
+         }
+         u32 distinct_values_index = 0;
+         for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) {
+            doubles.push_back(distinct_values[distinct_values_index]);
+            distinct_values_index++;
+            distinct_values_index = distinct_values_index % distinct_values_count;
+         }
+         writeBinary(TEST_DATASET("double/DICTIONARY_16.double"), doubles);
+         writeBinary(TEST_DATASET("double/DICTIONARY_16.bitmap"), bitmap);
+         doubles.clear();
+      }
+      // Random
+      {
+         doubles = vector<DOUBLE>();
+         std::random_device rd;
+         std::mt19937 gen(rd());
+         std::uniform_real_distribution<> dis(-999999, 999999); // pick better range
+         for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) {
+            doubles.push_back(dis(gen));
+         }
+         writeBinary(TEST_DATASET("double/RANDOM.double"), doubles);
+         writeBinary(TEST_DATASET("double/RANDOM.bitmap"), bitmap);
+         doubles.clear();
+      }
+      // TODO:
+   }
+   // -------------------------------------------------------------------------------------
+   // STRING
+   {
+      vector<string> strings;
+      if ( mkdir(TEST_DATASET("string"), S_IRWXU | S_IRWXG) && errno != EEXIST ) {
+         cerr << "creating output directory failed, status = " << errno << endl;
+      }
+      // One Value
+      {
+         strings = vector<string>(FLAGS_tuple_count, "Hello Compression !");
+         writeBinary(TEST_DATASET("string/ONE_VALUE.string"), strings);
+         writeBinary(TEST_DATASET("string/ONE_VALUE.bitmap"), bitmap);
+         strings.clear();
+      }
+      // Dictionary 8
+      {
+         vector<string> distinct_values;
+         const u32 distinct_values_count = std::numeric_limits<u8>::max();
+         {
+            // prepare the distinct values
+            set<string> distinct_values_set;
+            while ( distinct_values_set.size() < distinct_values_count ) {
+               string str(10, 'a');
+               GenerateRandomString(str.data(), 10);
+               distinct_values_set.insert(str);
+            }
+            distinct_values = vector<string>(distinct_values_set.begin(), distinct_values_set.end());
+         }
+         u32 distinct_values_index = 0;
+         for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) {
+            strings.push_back(distinct_values[distinct_values_index]);
+            distinct_values_index++;
+            distinct_values_index = distinct_values_index % distinct_values_count;
+         }
+         writeBinary(TEST_DATASET("string/DICTIONARY_8.string"), strings);
+         writeBinary(TEST_DATASET("string/DICTIONARY_8.bitmap"), bitmap);
+         strings.clear();
+      }
+      // WARNING: from here on, the bitmap actually marks null values
+      // Dictionary 16
+      {
+         vector<string> distinct_values;
+         const u32 distinct_values_count = std::numeric_limits<u8>::max() * 4;
+         {
+            // prepare the distinct values
+            set<string> distinct_values_set;
+            while ( distinct_values_set.size() < distinct_values_count ) {
+               string str(10, 'a');
+               GenerateRandomString(str.data(), 10);
+               distinct_values_set.insert(str);
+            }
+            distinct_values = vector<string>(distinct_values_set.begin(), distinct_values_set.end());
+         }
+         u32 distinct_values_index = 0;
+         for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) {
+            if ( rand() % 10 ) {
+               strings.push_back(distinct_values[distinct_values_index]);
+               distinct_values_index++;
+               distinct_values_index = distinct_values_index % distinct_values_count;
+            } else {
+               strings.push_back("");
+               bitmap[i] = 0;
+               continue;
+            }
+         }
+         writeBinary(TEST_DATASET("string/DICTIONARY_16.string"), strings);
+         writeBinary(TEST_DATASET("string/DICTIONARY_16.bitmap"), bitmap);
+         strings.clear();
+         bitmap = vector<BITMAP>(FLAGS_tuple_count, 1);
+      }
+
+      // -------------------------------------------------------------------------------------
+      // V2 Schemes start here
+      // -------------------------------------------------------------------------------------
+      // Integer
+      // -------------------------------------------------------------------------------------
+      // RLE
+      {
+         vector<INTEGER> integers;
+         {
+            integers = vector<INTEGER>(FLAGS_tuple_count);
+            for ( uint32_t i = 0; i < FLAGS_tuple_count; i++ ) {
+               integers[i] = (rand() % 100000);
+               if ( rand() % 10 > 2 ) {
+                  size_t repeat = 20;
+                  repeat = std::min(repeat, FLAGS_tuple_count - i - 1);
+                  for ( size_t r_i = 1; r_i <= repeat; r_i++ ) {
+                     integers[i + r_i] = integers[i];
+                  }
+                  i += repeat;
+               }
+            }
+            writeBinary(TEST_DATASET("integer/RLE.integer"), integers);
+            writeBinary(TEST_DATASET("integer/RLE.bitmap"), bitmap);
+            integers.clear();
+         }
+      }
+      // Frequency
+      {
+         vector<INTEGER> integers;
+         {
+            integers = vector<INTEGER>(FLAGS_tuple_count);
+            INTEGER top_value = rand() % std::numeric_limits<INTEGER>::max();
+            for ( uint32_t i = 0; i < FLAGS_tuple_count; i++ ) {
+               if ( rand() % 100 > 98 ) {
+                  integers[i] = rand();
+               } else {
+                  integers[i] = top_value;
+               }
+            }
+            writeBinary(TEST_DATASET("integer/FREQUENCY.integer"), integers);
+            writeBinary(TEST_DATASET("integer/FREQUENCY.bitmap"), bitmap);
+            integers.clear();
+         }
+      }
+      // -------------------------------------------------------------------------------------
+      // Double
+      {
+         vector<DOUBLE> doubles;
+         {
+            doubles = vector<DOUBLE>(FLAGS_tuple_count);
+            DOUBLE top_value = rand() % std::numeric_limits<INTEGER>::max();
+            for ( uint32_t i = 0; i < FLAGS_tuple_count; i++ ) {
+               if ( rand() % 100 > 98 ) {
+                  doubles[i] = rand();
+               } else {
+                  doubles[i] = top_value;
+               }
+            }
+            writeBinary(TEST_DATASET("double/FREQUENCY.double"), doubles);
+            writeBinary(TEST_DATASET("double/FREQUENCY.bitmap"), bitmap);
+            doubles.clear();
+         }
+      }
+      // String
+      // -------------------------------------------------------------------------------------
+      const u32 TZT_MIN_INPUT = 200 * 1024;
+      // Compressed dictionary
+      {
+         vector<string> distinct_values;
+         const u32 distinct_values_count = std::numeric_limits<u16>::max() / 2 - NULL_CODE_MARGIN;
+         const u32 string_length = TZT_MIN_INPUT / distinct_values_count;
+         {
+            // prepare the distinct values
+            set<string> distinct_values_set;
+            while ( distinct_values_set.size() < distinct_values_count ) {
+               string str(string_length, 'a');
+               GenerateRandomString(str.data(), string_length, 15);
+               distinct_values_set.insert(str);
+            }
+            distinct_values = vector<string>(distinct_values_set.begin(), distinct_values_set.end());
+         }
+         u32 distinct_values_index = 0;
+         for ( u64 i = 0; i < FLAGS_tuple_count; i++ ) {
+            strings.push_back(distinct_values[distinct_values_index]);
+            distinct_values_index++;
+            distinct_values_index = distinct_values_index % distinct_values_count;
+         }
+         writeBinary(TEST_DATASET("string/COMPRESSED_DICTIONARY.string"), strings);
+         writeBinary(TEST_DATASET("string/COMPRESSED_DICTIONARY.bitmap"), bitmap);
+         strings.clear();
+      }
+   }
+   return 0;
+}
+// -------------------------------------------------------------------------------------
+void GenerateRandomString(char *dest, SIZE len, u8 entropy)
+{
+   const static char alphanum[] = "0123456789"
+                                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                  "abcdefghijklmnopqrstuvwxyz";
+
+   // entropy = number of leading alphabet characters to draw from
+   for ( unsigned j = 0; j < len; ++j ) {
+      dest[j] = alphanum[std::rand() % entropy];
+   }
+}
+// -------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/test/test-cases/TestHelper.cpp b/benchmarks/analyze_better_blocks/test/test-cases/TestHelper.cpp
new file mode 100644
index 0000000..31aa801
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/test/test-cases/TestHelper.cpp
@@ -0,0 +1,31 @@
+#include "TestHelper.hpp"
+// -------------------------------------------------------------------------------------
+
+// -------------------------------------------------------------------------------------
+void TestHelper::CheckRelationCompression(Relation &relation, CMachine &compressor, const vector<u8> expected_compression_schemes)
+{
+   auto ranges = relation.getRanges(cengine::SplitStrategy::SEQUENTIAL, 999999);
+   vector<BytesArray> compressed_chunks;
+   compressed_chunks.resize(ranges.size());
+   for ( u32 chunk_i = 0; chunk_i < ranges.size(); chunk_i++ ) {
+      auto chunk = relation.getChunk(ranges, chunk_i);
+      auto db_meta = compressor.compress(chunk, compressed_chunks[chunk_i]);
+      if ( expected_compression_schemes.size()) {
+         //auto scheme_cnt = 0;
+         //for (auto e : expected_compression_schemes) {
+         //   for (auto x : db_meta.used_compression_schemes) {
+         //      scheme_cnt += e == x;
+         //   }
+         //}
+         //EXPECT_EQ(scheme_cnt, expected_compression_schemes.size());
+      }
+      ASSERT_GE(db_meta.compression_ratio, 0.95);
+      cout << "db_meta.compression_ratio = " << db_meta.compression_ratio << endl;
+
+      auto decompressed_chunk = compressor.decompress(compressed_chunks[chunk_i]);
+      ASSERT_TRUE(decompressed_chunk == chunk);
+   }
+}
+// -------------------------------------------------------------------------------------
+
+// 
------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/test/test-cases/TestHelper.hpp b/benchmarks/analyze_better_blocks/test/test-cases/TestHelper.hpp new file mode 100644 index 0000000..1b92fed --- /dev/null +++ b/benchmarks/analyze_better_blocks/test/test-cases/TestHelper.hpp @@ -0,0 +1,13 @@ +#pragma once +#include "storage/Relation.hpp" +#include "datablock/Datablock.hpp" +// ------------------------------------------------------------------------------------- +#include "gtest/gtest.h" +// ------------------------------------------------------------------------------------- +using namespace cengine; +// ------------------------------------------------------------------------------------- +class TestHelper { +public: + static void CheckRelationCompression(Relation &relation, CMachine &compressor, const vector expected_compression_schemes = {}); +}; +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/test/test-cases/V1.cpp b/benchmarks/analyze_better_blocks/test/test-cases/V1.cpp new file mode 100644 index 0000000..7bbf53f --- /dev/null +++ b/benchmarks/analyze_better_blocks/test/test-cases/V1.cpp @@ -0,0 +1,161 @@ +#include "storage/Relation.hpp" +#include "datablock/Datablock.hpp" +#include "TestHelper.hpp" +#include "datablock/schemes/CSchemePool.hpp" +// ------------------------------------------------------------------------------------- +#include "gtest/gtest.h" +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +DECLARE_uint32(force_string_scheme); +DECLARE_uint32(force_integer_scheme); +DECLARE_uint32(force_double_scheme); +// ------------------------------------------------------------------------------------- +using namespace cengine; +namespace harbook { +using namespace db; +// ------------------------------------------------------------------------------------- +TEST(V1, IntegerOneValue) +{ + Relation relation; + relation.addColumn(TEST_DATASET("integer/ONE_VALUE.integer")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(IntegerSchemeType::ONE_VALUE)}); +} +// ------------------------------------------------------------------------------------- +TEST(V1, IntegerTruncate8) +{ + FLAGS_force_integer_scheme = CB(IntegerSchemeType::TRUNCATION_8); + Relation relation; + relation.addColumn(TEST_DATASET("integer/TRUNCATE_8.integer")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(IntegerSchemeType::TRUNCATION_8)}); + FLAGS_force_integer_scheme = AUTO_SCHEME; +} +// ------------------------------------------------------------------------------------- +TEST(V1, IntegerTruncate16) +{ + FLAGS_force_integer_scheme = CB(IntegerSchemeType::TRUNCATION_16); + Relation relation; + relation.addColumn(TEST_DATASET("integer/TRUNCATE_16.integer")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(IntegerSchemeType::TRUNCATION_16)}); + FLAGS_force_integer_scheme = AUTO_SCHEME; +} +// ------------------------------------------------------------------------------------- +TEST(V1, IntegerDictionary8) +{ + Relation relation; + relation.addColumn(TEST_DATASET("integer/DICTIONARY_8.integer")); + 
cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(IntegerSchemeType::DICTIONARY_8)}); +} +// ------------------------------------------------------------------------------------- +TEST(V1, IntegerDictionary16) +{ + Relation relation; + relation.addColumn(TEST_DATASET("integer/DICTIONARY_16.integer")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(IntegerSchemeType::DICTIONARY_16)}); +} +// ------------------------------------------------------------------------------------- +TEST(V1, IntegerDictionary) +{ + Relation relation; + relation.addColumn(TEST_DATASET("integer/DICTIONARY_8.integer")); + relation.addColumn(TEST_DATASET("integer/DICTIONARY_16.integer")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(IntegerSchemeType::DICTIONARY_8), CB(IntegerSchemeType::DICTIONARY_16)}); +} +// ------------------------------------------------------------------------------------- +TEST(V1, Integer) +{ + cengine::db::CSchemePool::refresh(); + // ------------------------------------------------------------------------------------- + Relation relation; + relation.addColumn(TEST_DATASET("integer/ONE_VALUE.integer")); + relation.addColumn(TEST_DATASET("integer/TRUNCATE_8.integer")); + relation.addColumn(TEST_DATASET("integer/TRUNCATE_16.integer")); + relation.addColumn(TEST_DATASET("integer/DICTIONARY_8.integer")); + relation.addColumn(TEST_DATASET("integer/DICTIONARY_16.integer")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, + { + CB(IntegerSchemeType::ONE_VALUE), CB(IntegerSchemeType::TRUNCATION_8), CB(IntegerSchemeType::TRUNCATION_16), CB(IntegerSchemeType::DICTIONARY_8) + , CB(IntegerSchemeType::DICTIONARY_16) + }); +} +// ------------------------------------------------------------------------------------- +TEST(V1, MixedTest) +{ + Relation relation; + relation.addColumn(TEST_DATASET("integer/ONE_VALUE.integer")); + relation.addColumn(TEST_DATASET("integer/TRUNCATE_8.integer")); + relation.addColumn(TEST_DATASET("integer/TRUNCATE_16.integer")); + relation.addColumn(TEST_DATASET("integer/DICTIONARY_8.integer")); + relation.addColumn(TEST_DATASET("integer/DICTIONARY_16.integer")); + relation.addColumn(TEST_DATASET("double/ONE_VALUE.double")); + relation.addColumn(TEST_DATASET("string/ONE_VALUE.string")); + relation.addColumn(TEST_DATASET("string/DICTIONARY_8.string")); + relation.addColumn(TEST_DATASET("string/DICTIONARY_16.string")); + cengine::db::Datablock datablockV2(relation); + TestHelper::CheckRelationCompression(relation, datablockV2); +} +// ------------------------------------------------------------------------------------- +TEST(V1, DoubleOneValue) +{ + Relation relation; + relation.addColumn(TEST_DATASET("double/ONE_VALUE.double")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(DoubleSchemeType::ONE_VALUE)}); +} +// ------------------------------------------------------------------------------------- +TEST(V1, DoubleDict8) +{ + FLAGS_force_double_scheme = CB(DoubleSchemeType::DICTIONARY_8); + Relation relation; + relation.addColumn(TEST_DATASET("double/DICTIONARY_8.double")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(DoubleSchemeType::DICTIONARY_8)}); + FLAGS_force_double_scheme = AUTO_SCHEME; +} +// 
------------------------------------------------------------------------------------- +TEST(V1, DoubleDict16) +{ + FLAGS_force_double_scheme = CB(DoubleSchemeType::DICTIONARY_16); + Relation relation; + relation.addColumn(TEST_DATASET("double/DICTIONARY_16.double")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(DoubleSchemeType::DICTIONARY_16)}); + FLAGS_force_double_scheme = AUTO_SCHEME; +} +// ------------------------------------------------------------------------------------- +TEST(V1, DoubleRandom) +{ + Relation relation; + relation.addColumn(TEST_DATASET("double/RANDOM.double")); + cengine::db::Datablock Datablock(relation); + TestHelper::CheckRelationCompression(relation, Datablock, {CB(DoubleSchemeType::UNCOMPRESSED)}); +} +// ------------------------------------------------------------------------------------- +TEST(V1, StringOneValue) +{ + Relation relation; + relation.addColumn(TEST_DATASET("string/ONE_VALUE.string")); + + cengine::db::Datablock datablockV2(relation); + TestHelper::CheckRelationCompression(relation, datablockV2, {CB(StringSchemeType::ONE_VALUE)}); +} +// ------------------------------------------------------------------------------------- +TEST(V1, StringDictionary) +{ + Relation relation; + relation.addColumn(TEST_DATASET("string/DICTIONARY_8.string")); + relation.addColumn(TEST_DATASET("string/DICTIONARY_16.string")); + + cengine::db::Datablock datablockV2(relation); + TestHelper::CheckRelationCompression(relation, datablockV2, {CB(StringSchemeType::DICTIONARY_8), CB(StringSchemeType::DICTIONARY_16)}); +} +// ------------------------------------------------------------------------------------- +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/test/test-cases/V2.cpp b/benchmarks/analyze_better_blocks/test/test-cases/V2.cpp new file mode 100644 index 0000000..bdb2aef --- /dev/null +++ b/benchmarks/analyze_better_blocks/test/test-cases/V2.cpp @@ -0,0 +1,110 @@ +#include "storage/Relation.hpp" +#include "datablock/Datablock.hpp" +#include "TestHelper.hpp" +#include "datablock/schemes/DoubleSchemeType.hpp" +// ------------------------------------------------------------------------------------- +#include "gtest/gtest.h" +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +DECLARE_uint32(force_string_scheme); +DECLARE_uint32(force_integer_scheme); +DECLARE_uint32(force_double_scheme); +#include "datablock/schemes/CSchemePool.hpp" +DECLARE_bool(db2); +// ------------------------------------------------------------------------------------- +namespace cengine { +using namespace db; +// ------------------------------------------------------------------------------------- +TEST(V2, Begin) { + FLAGS_db2 = true; + cengine::db::CSchemePool::refresh(); +} +// ------------------------------------------------------------------------------------- +TEST(V2, StringCompressedDictionary) +{ + FLAGS_force_string_scheme = CB(StringSchemeType::S_DICT); + Relation relation; + relation.addColumn(TEST_DATASET("string/COMPRESSED_DICTIONARY.string")); + cengine::db::Datablock datablockV2(relation); + TestHelper::CheckRelationCompression(relation, datablockV2, {CB(StringSchemeType::S_DICT)}); +} +// ------------------------------------------------------------------------------------- 
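+// Each V2 test below follows the same pattern: pin one scheme through its force-flag,
+// compress the synthetic column that was generated for exactly that scheme, assert the
+// expected scheme id, then restore automatic selection. Sketch of the pattern
+// (illustrative only; it mirrors the tests that follow):
+//
+//    FLAGS_force_integer_scheme = CB(IntegerSchemeType::X_RLE);  // pin the scheme
+//    Relation relation;
+//    relation.addColumn(TEST_DATASET("integer/RLE.integer"));    // matching dataset
+//    cengine::db::Datablock datablockV2(relation);
+//    TestHelper::CheckRelationCompression(relation, datablockV2, {CB(IntegerSchemeType::X_RLE)});
+//    FLAGS_force_integer_scheme = AUTO_SCHEME;                   // back to auto selection
+// -------------------------------------------------------------------------------------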
+TEST(V2, IntegerRLE)
+{
+   FLAGS_force_integer_scheme = CB(IntegerSchemeType::X_RLE);
+   Relation relation;
+   relation.addColumn(TEST_DATASET("integer/RLE.integer"));
+   cengine::db::Datablock datablockV2(relation);
+   TestHelper::CheckRelationCompression(relation, datablockV2, {CB(IntegerSchemeType::X_RLE)});
+   FLAGS_force_integer_scheme = AUTO_SCHEME;
+}
+// -------------------------------------------------------------------------------------
+TEST(V2, DoubleRLE)
+{
+   FLAGS_force_double_scheme = CB(DoubleSchemeType::X_RLE);
+   Relation relation;
+   relation.addColumn(TEST_DATASET("double/RANDOM.double"));
+   cengine::db::Datablock datablockV2(relation);
+   TestHelper::CheckRelationCompression(relation, datablockV2, {CB(DoubleSchemeType::X_RLE)});
+   FLAGS_force_double_scheme = AUTO_SCHEME;
+}
+// -------------------------------------------------------------------------------------
+TEST(V2, IntegerDynamicDict)
+{
+   FLAGS_force_integer_scheme = CB(IntegerSchemeType::X_DICT);
+   Relation relation;
+   relation.addColumn(TEST_DATASET("integer/DICTIONARY_16.integer"));
+   cengine::db::Datablock datablockV2(relation);
+   TestHelper::CheckRelationCompression(relation, datablockV2, {CB(IntegerSchemeType::X_DICT)});
+   FLAGS_force_integer_scheme = AUTO_SCHEME;
+}
+// -------------------------------------------------------------------------------------
+TEST(V2, DoubleDecimal)
+{
+   FLAGS_force_double_scheme = CB(DoubleSchemeType::X_DECIMAL);
+   Relation relation;
+   relation.addColumn(TEST_DATASET("double/DICTIONARY_8.double"));
+   cengine::db::Datablock datablockV2(relation);
+   TestHelper::CheckRelationCompression(relation, datablockV2, {CB(DoubleSchemeType::X_DECIMAL)});
+   FLAGS_force_double_scheme = AUTO_SCHEME;
+}
+// -------------------------------------------------------------------------------------
+TEST(V2, DoubleDynamicDict)
+{
+   FLAGS_force_double_scheme = CB(DoubleSchemeType::X_DICT);
+   Relation relation;
+   relation.addColumn(TEST_DATASET("double/DICTIONARY_8.double"));
+   cengine::db::Datablock datablockV2(relation);
+   TestHelper::CheckRelationCompression(relation, datablockV2, {CB(DoubleSchemeType::X_DICT)});
+   FLAGS_force_double_scheme = AUTO_SCHEME;
+}
+// -------------------------------------------------------------------------------------
+// TEST(V2, IntegerFrequency)
+// {
+//    FLAGS_force_integer_scheme = CB(IntegerSchemeType::X_FREQUENCY);
+//    Relation relation;
+//    relation.addColumn(TEST_DATASET("integer/FREQUENCY.integer"));
+//    cengine::db::Datablock datablockV2(relation);
+//    TestHelper::CheckRelationCompression(relation, datablockV2, {CB(IntegerSchemeType::X_FREQUENCY)});
+//    FLAGS_force_integer_scheme = AUTO_SCHEME;
+// }
+// -------------------------------------------------------------------------------------
+// scheme is disabled
+TEST(V2, DoubleFrequency)
+{
+   FLAGS_force_double_scheme = CB(DoubleSchemeType::X_FREQUENCY);
+   Relation relation;
+   relation.addColumn(TEST_DATASET("double/FREQUENCY.double"));
+   cengine::db::Datablock datablockV2(relation);
+   TestHelper::CheckRelationCompression(relation, datablockV2, {CB(DoubleSchemeType::X_FREQUENCY)});
+   FLAGS_force_double_scheme = AUTO_SCHEME;
+}
+// -------------------------------------------------------------------------------------
+TEST(V2, End)
+{
+   FLAGS_db2 = false;
+   cengine::db::CSchemePool::refresh();
+}
+// -------------------------------------------------------------------------------------
+}
+// -------------------------------------------------------------------------------------
diff --git a/benchmarks/analyze_better_blocks/test/tester.cpp
b/benchmarks/analyze_better_blocks/test/tester.cpp new file mode 100644 index 0000000..fc8ba19 --- /dev/null +++ b/benchmarks/analyze_better_blocks/test/tester.cpp @@ -0,0 +1,25 @@ +// --------------------------------------------------------------------------- +// cengine +// --------------------------------------------------------------------------- +#include +#include "gtest/gtest.h" +#include "gflags/gflags.h" +#include "datablock/schemes/CSchemePool.hpp" +#include "SIMD.hpp" +// --------------------------------------------------------------------------- +DEFINE_string(fsst_stats, "", ""); +// --------------------------------------------------------------------------- +int main(int argc, char *argv[]) +{ + #ifdef BTR_USE_SIMD + std::cout << "\033[0;35mSIMD ENABLED\033[0m" << std::endl; + #else + std::cout << "\033[0;31mSIMD DISABLED\033[0m" << std::endl; + #endif + testing::InitGoogleTest(&argc, argv); + // ------------------------------------------------------------------------------------- + gflags::ParseCommandLineFlags(&argc, &argv, true); + cengine::db::CSchemePool::available_schemes = make_unique(); + return RUN_ALL_TESTS(); +} +// --------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/tools/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/dataset_distribution.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/dataset_distribution.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/dataset_distribution_compressed.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/dataset_distribution_compressed.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/estimation_deviation.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/estimation_deviation.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/overnight.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/overnight.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/sample_parameters.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/sample_parameters.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/schemes_plugging.csv b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/schemes_plugging.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/schemes_plugging.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/schemes_plugging.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/total_compression_factor.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/total_compression_factor.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/total_compression_factor.tsv b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/total_compression_factor.tsv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/tzt.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/tzt.r new file mode 
100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_block_size.csv b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_block_size.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_block_size.r b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_block_size.r new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_sample_parameter.csv b/benchmarks/analyze_better_blocks/tools/analysis/r-scripts/varying_sample_parameter.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/conversion/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/conversion/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/conversion/PerfEvent.hpp b/benchmarks/analyze_better_blocks/tools/conversion/PerfEvent.hpp new file mode 100644 index 0000000..6ce654b --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/conversion/PerfEvent.hpp @@ -0,0 +1,268 @@ +/* + +Copyright (c) 2018 Viktor Leis + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + */ + +#pragma once + +#if defined(__linux__) + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct PerfEvent { + + struct event { + struct read_format { + uint64_t value; + uint64_t time_enabled; + uint64_t time_running; + uint64_t id; + }; + + perf_event_attr pe; + int fd; + read_format prev; + read_format data; + + double readCounter() { + double multiplexingCorrection = static_cast(data.time_enabled - prev.time_enabled) / static_cast(data.time_running - prev.time_running); + return static_cast(data.value - prev.value) * multiplexingCorrection; + } + }; + + std::vector events; + std::vector names; + std::chrono::time_point startTime; + std::chrono::time_point stopTime; + + PerfEvent() { + registerCounter("cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); + registerCounter("instructions", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); + registerCounter("L1-misses", PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_L1D|(PERF_COUNT_HW_CACHE_OP_READ<<8)|(PERF_COUNT_HW_CACHE_RESULT_MISS<<16)); + registerCounter("LLC-misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES); + registerCounter("branch-misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES); + registerCounter("task-clock", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK); + // additional counters can be found in linux/perf_event.h + + for (unsigned i=0; i(syscall(__NR_perf_event_open, &event.pe, 0, -1, -1, 0)); + if (event.fd < 0) { + std::cerr << "Error opening counter " << names[i] << std::endl; + events.resize(0); + names.resize(0); + return; + } + } + } + + void registerCounter(const std::string& name, uint64_t type, uint64_t eventID) { + names.push_back(name); + events.push_back(event()); + auto& event = events.back(); + auto& pe = event.pe; + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = static_cast(type); + pe.size = sizeof(struct perf_event_attr); + pe.config = eventID; + pe.disabled = true; + pe.inherit = 1; + pe.inherit_stat = 0; + pe.exclude_kernel = false; + pe.exclude_hv = false; + pe.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; + } + + void startCounters() { + for (unsigned i=0; i(stopTime - startTime).count(); + } + + double getIPC() { + return getCounter("instructions") / getCounter("cycles"); + } + + double getCPUs() { + return getCounter("task-clock") / (getDuration() * 1e9); + } + + double getGHz() { + return getCounter("cycles") / getCounter("task-clock"); + } + + double getCounter(const std::string& name) { + for (unsigned i=0; i(width)) << name << (addComma ? "," : "") << " "; + dataOut << std::setw(static_cast(width)) << counterValue << (addComma ? 
"," : "") << " "; + } + + template + static void printCounter(std::ostream& headerOut, std::ostream& dataOut, std::string name, T counterValue,bool addComma=true) { + std::stringstream stream; + stream << std::fixed << std::setprecision(2) << counterValue; + PerfEvent::printCounter(headerOut,dataOut,name,stream.str(),addComma); + } + + void printReport(std::ostream& out, uint64_t normalizationConstant) { + std::stringstream header; + std::stringstream data; + printReport(header,data,normalizationConstant); + out << header.str() << std::endl; + out << data.str() << std::endl; + } + + void printReport(std::ostream& headerOut, std::ostream& dataOut, uint64_t normalizationConstant) { + if (!events.size()) + return; + + // print all metrics + for (unsigned i=0; i(normalizationConstant)); + } + + printCounter(headerOut,dataOut,"scale",normalizationConstant); + + // derived metrics + printCounter(headerOut,dataOut,"IPC",getIPC()); + printCounter(headerOut,dataOut,"CPUs",getCPUs()); + printCounter(headerOut,dataOut,"GHz",getGHz(),false); + } +}; + +struct BenchmarkParameters { + + void setParam(const std::string& name,const std::string& value) { + params[name]=value; + } + + void setParam(const std::string& name,const char* value) { + params[name]=value; + } + + template + void setParam(const std::string& name,T value) { + setParam(name,std::to_string(value)); + } + + void printParams(std::ostream& header,std::ostream& data) { + for (auto& p : params) { + PerfEvent::printCounter(header,data,p.first,p.second); + } + } + + BenchmarkParameters(std::string name="") { + if (name.length()) + setParam("name",name); + } + + private: + std::map params; +}; + +struct PerfEventBlock { + PerfEvent e; + uint64_t scale; + BenchmarkParameters parameters; + bool printHeader; + + PerfEventBlock(uint64_t scale = 1, BenchmarkParameters params = {}, bool printHeader = true) + : scale(scale), + parameters(params), + printHeader(printHeader) { + e.startCounters(); + } + + ~PerfEventBlock() { + e.stopCounters(); + std::stringstream header; + std::stringstream data; + parameters.printParams(header,data); + PerfEvent::printCounter(header,data,"time sec",e.getDuration()); + e.printReport(header, data, scale); + if (printHeader) + std::cout << header.str() << std::endl; + std::cout << data.str() << std::endl; + } +}; + +#else +#include +struct PerfEvent { + void startCounters() {} + void stopCounters() {} + void printReport(std::ostream&, uint64_t) {} + template void setParam(const std::string&, const T&) {}; +}; + +struct BenchmarkParameters { +}; + +struct PerfEventBlock { + PerfEventBlock(uint64_t = 1, BenchmarkParameters = {}, bool = true) {}; + PerfEventBlock(PerfEvent e, uint64_t = 1, BenchmarkParameters = {}, bool = true) {}; +}; +#endif diff --git a/benchmarks/analyze_better_blocks/tools/conversion/btrmeta.cpp b/benchmarks/analyze_better_blocks/tools/conversion/btrmeta.cpp new file mode 100644 index 0000000..4de0544 --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/conversion/btrmeta.cpp @@ -0,0 +1,97 @@ +// +// Created by david on 29.04.22. 
+//
+#include <algorithm>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+
+#include "gflags/gflags.h"
+
+#include "datablock/BtrReader.hpp"
+#include "datablock/schemes/CSchemePool.hpp"
+#include "utils/Utils.hpp"
+
+DEFINE_string(btr, "btr", "Directory with btr input");
+DEFINE_int32(chunk, -1, "Chunk to inspect");
+DEFINE_int32(column, -1, "Column to inspect");
+DEFINE_string(fsst_stats, "", ""); // unused, defined to make linker not break
+
+int main(int argc, char **argv) {
+   gflags::ParseCommandLineFlags(&argc, &argv, true);
+   std::filesystem::path btr_dir = FLAGS_btr;
+   cengine::db::CSchemePool::refresh();
+
+   std::vector<char> raw_file_metadata;
+   const cengine::db::FileMetadata *file_metadata;
+   {
+      auto metadata_path = btr_dir / "metadata";
+      std::ifstream file(metadata_path, std::ios::binary | std::ios::ate);
+      std::streamsize filesize = file.tellg();
+      file.seekg(0, std::ios::beg);
+      raw_file_metadata.resize(filesize);
+      file.read(raw_file_metadata.data(), filesize);
+      if (file.bad()) {
+         throw Generic_Exception("Reading metadata failed");
+      }
+      file_metadata = reinterpret_cast<const cengine::db::FileMetadata *>(raw_file_metadata.data());
+   }
+
+   vector<u32> columns;
+   if (FLAGS_column != -1) {
+      columns.push_back(FLAGS_column);
+   } else {
+      u32 num_columns = file_metadata->num_columns;
+      columns.resize(num_columns);
+      std::iota(columns.begin(), columns.end(), 0);
+   }
+
+   // column -> (column type, scheme description -> chunks using that scheme)
+   std::unordered_map<u32, std::pair<ColumnType, std::map<std::string, std::vector<u32>>>> m;
+
+   u32 total_tuples = 0;
+   u32 chunk_i = 0;
+   for (u32 column_i : columns) {
+      for (u32 part_i = 0; part_i < file_metadata->parts[column_i].num_parts; part_i++) {
+         thread_local std::vector<char> compressed_data;
+         auto path = btr_dir / ("column" + std::to_string(column_i) + "_part" + std::to_string(part_i));
+         cengine::Utils::readFileToMemory(path.string(), compressed_data);
+         cengine::db::BtrReader reader(compressed_data.data());
+         for (u32 part_chunk_i = 0; part_chunk_i < reader.getChunkCount(); part_chunk_i++) {
+            if (FLAGS_chunk == -1 || FLAGS_chunk == chunk_i) {
+               total_tuples += reader.getTupleCount(part_chunk_i);
+               auto scheme_description = reader.getSchemeDescription(part_chunk_i);
+               if (m.count(column_i) == 0) {
+                  m[column_i] = {reader.getColumnType(), {}};
+               }
+
+               if (m[column_i].second.count(scheme_description) == 0) {
+                  // First time this scheme occurs
+                  m[column_i].second[scheme_description] = {};
+               }
+               m[column_i].second[scheme_description].push_back(chunk_i);
+            }
+            chunk_i++;
+         }
+      }
+   }
+
+   // General info
+   std::cout << "Looked at " << (FLAGS_chunk == -1 ? file_metadata->num_chunks : 1) << " chunks with " << columns.size() << " columns, a total of " << total_tuples << " tuples" << std::endl;
+   std::cout << std::endl;
+   for (auto &[column, p] : m) {
+      auto &[type, scheme_map] = p;
+      std::cout << "Column " << column << " with type " << ConvertTypeToString(type) << ":" << std::endl;
+      for (auto &[scheme, chunks] : scheme_map) {
+         std::sort(chunks.begin(), chunks.end());
+         std::cout << chunks.size() << " chunks [ ";
+         for (auto chunk : chunks) {
+            std::cout << chunk << " ";
+         }
+         std::cout << "] use" << std::endl;
+         std::cout << scheme << std::endl;
+      }
+      std::cout << std::endl;
+   }
+   return 0;
+}
diff --git a/benchmarks/analyze_better_blocks/tools/conversion/btrtocsv.cpp b/benchmarks/analyze_better_blocks/tools/conversion/btrtocsv.cpp
new file mode 100644
index 0000000..5c1f8f4
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/tools/conversion/btrtocsv.cpp
@@ -0,0 +1,167 @@
+//
+// Created by david on 14.04.22.
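+// btrtocsv: reads a btr directory and writes the decompressed data back out as a
+// single '|'-separated CSV file. Example call (flag names from the DEFINEs below;
+// paths are illustrative):
+//   btrtocsv --btr btr_dir --csv data.csv --threads 8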
+//
+
+// Standard libs
+#include <filesystem>
+#include <fstream>
+
+// External libs
+#include "gflags/gflags.h"
+#include "yaml-cpp/yaml.h"
+#include "spdlog/spdlog.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+
+// Btr internal includes
+#include "storage/Relation.hpp"
+#include "datablock/schemes/CSchemePool.hpp"
+#include "datablock/Datablock.hpp"
+#include "datablock/cache/ThreadCache.hpp"
+#include "parser/Parser.hpp"
+#include "analysis/Analysis.hpp"
+#include "datablock/BtrReader.hpp"
+#include "storage/StringPointerArrayViewer.hpp"
+#include "utils/Utils.hpp"
+
+DEFINE_string(btr, "btr", "Directory with btr input");
+// TODO we need to get rid of the schema here. It MUST be possible to read the files in without it, or generate it from the files.
+DEFINE_string(csv, "data.csv", "File for CSV output");
+DEFINE_uint32(threads, 8, "");
+// This one actually does nothing, but without it linking will fail.
+DEFINE_string(fsst_stats, "", "");
+
+void outputChunk(std::ofstream &csvstream, u32 tuple_count,
+                 const std::vector<std::pair<u32, u32>> &counters,
+                 const std::vector<std::vector<u8>> &decompressed_columns,
+                 std::vector<std::vector<cengine::db::BtrReader>> &readers,
+                 const std::vector<u8> &requires_copy) {
+   const std::string field_separator = "|";
+   const std::string line_separator = "\n";
+
+   for (size_t row = 0; row < tuple_count; row++) {
+      for (size_t col = 0; col < decompressed_columns.size(); col++) {
+         cengine::db::BtrReader &reader = readers[col][counters[col].first];
+         BitmapWrapper *nullmap = reader.getBitmap(counters[col].second - 1);
+
+         bool is_null;
+         if (nullmap->type() == BitmapType::ALLZEROS) {
+            is_null = true;
+         } else if (nullmap->type() == BitmapType::ALLONES) {
+            is_null = false;
+         } else {
+            is_null = !(nullmap->get_bitset()->test(row));
+         }
+
+         if (!is_null) {
+            switch (reader.getColumnType()) {
+               case ColumnType::INTEGER: {
+                  auto int_array = reinterpret_cast<const INTEGER *>(decompressed_columns[col].data());
+                  csvstream << int_array[row];
+                  break;
+               }
+               case ColumnType::DOUBLE: {
+                  auto double_array = reinterpret_cast<const DOUBLE *>(decompressed_columns[col].data());
+                  csvstream << double_array[row];
+                  break;
+               }
+               case ColumnType::STRING: {
+                  std::string data;
+                  if (requires_copy[col]) {
+                     auto string_pointer_array_viewer = cengine::StringPointerArrayViewer(reinterpret_cast<const u8 *>(decompressed_columns[col].data()));
+                     data = string_pointer_array_viewer(row);
+                  } else {
+                     auto string_array_viewer = cengine::StringArrayViewer(reinterpret_cast<const u8 *>(decompressed_columns[col].data()));
+                     data = string_array_viewer(row);
+                  }
+                  csvstream << data;
+                  break;
+               }
+               default: {
+                  throw Generic_Exception("Type " + ConvertTypeToString(reader.getColumnType()) + " not supported");
+               }
+            }
+         } else {
+            csvstream << "null";
+         }
+
+         if (col + 1 != decompressed_columns.size()) {
+            csvstream << field_separator;
+         }
+      }
+      csvstream << line_separator;
+   }
+}
+
+int main(int argc, char **argv)
+{
+   gflags::ParseCommandLineFlags(&argc, &argv, true);
+   std::filesystem::path btr_dir = FLAGS_btr;
+
+   // This seems to be necessary before any decompression happens.
+   cengine::db::CSchemePool::refresh();
+
+   // Init TBB. TODO: is that actually still necessary?
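+   // Note: task_scheduler_init is deprecated in recent TBB releases; with oneTBB the
+   // equivalent thread cap would be
+   //    tbb::global_control gc(tbb::global_control::max_allowed_parallelism, FLAGS_threads);
+   // It is kept as-is here since this code appears to build against the older TBB API.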
+ tbb::task_scheduler_init init(FLAGS_threads); // NOLINT(cppcoreguidelines-narrowing-conversions) + + // Open output file + auto csvstream = ofstream(FLAGS_csv); + csvstream << std::setprecision(32); + + // Read the metadata + std::vector raw_file_metadata; + const cengine::db::FileMetadata *file_metadata; + { + auto metadata_path = btr_dir / "metadata"; + cengine::Utils::readFileToMemory(metadata_path.string(), raw_file_metadata); + file_metadata = reinterpret_cast(raw_file_metadata.data()); + } + + // Prepare the readers + std::vector> readers(file_metadata->num_columns); + std::vector>> compressed_data(file_metadata->num_columns); + tbb::parallel_for(u32(0), file_metadata->num_columns, [&](u32 column_i) { + compressed_data[column_i].resize(file_metadata->parts[column_i].num_parts); + for (u32 part_i = 0; part_i < file_metadata->parts[column_i].num_parts; part_i++) { + auto path = btr_dir / ("column" + std::to_string(column_i) + "_part" + std::to_string(part_i)); + cengine::Utils::readFileToMemory(path.string(), compressed_data[column_i][part_i]); + readers[column_i].emplace_back(compressed_data[column_i][part_i].data()); + } + }); + + // For each column counters contains a pair of + std::vector> counters(file_metadata->num_columns, {0, 0}); + + for (u32 chunk_i = 0; chunk_i < file_metadata->num_chunks; chunk_i++) { + std::vector> outputs(file_metadata->num_columns); + /* + * Intuitively we would use vector for requires_copy. However the + * stdlib may implement vector as a bitset. When reading and + * writing and operating won't just touch the current bit. It will also + * touch all the bits around it (in the same byte). This leads to + * contention and ultimately wrong values. Using another integral type + * vector solves the contention problem. + */ + std::vector requires_copy(file_metadata->num_columns); + u32 tuple_count = 0; + tbb::parallel_for(u32(0), file_metadata->num_columns, [&](u32 column_i) { + u32 part_i = counters[column_i].first; + cengine::db::BtrReader &reader = readers[column_i][part_i]; + if (counters[column_i].second >= reader.getChunkCount()) { + counters[column_i].first++; + part_i++; + counters[column_i].second = 0; + reader = readers[column_i][part_i]; + } + + u32 part_chunk_i = counters[column_i].second; + if (column_i == 0) { + tuple_count = reader.getTupleCount(part_chunk_i); + } + requires_copy[column_i] = reader.readColumn(outputs[column_i], part_chunk_i); + counters[column_i].second++; + }); + outputChunk(csvstream, tuple_count, counters, outputs, readers, requires_copy); + } +} diff --git a/benchmarks/analyze_better_blocks/tools/conversion/compare_csvs.py b/benchmarks/analyze_better_blocks/tools/conversion/compare_csvs.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/conversion/csvtobtr.cpp b/benchmarks/analyze_better_blocks/tools/conversion/csvtobtr.cpp new file mode 100644 index 0000000..755eac6 --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/conversion/csvtobtr.cpp @@ -0,0 +1,214 @@ +// +// Created by david on 13.04.22. 
+//
+// This program takes a csv file and its yaml schema and converts it to btr blocks
+//
+// Example call: csvtobtr TODO
+//
+
+// Standard libs
+#include <cassert>
+#include <chrono>
+#include <filesystem>
+#include <fstream>
+
+// External libs
+#include "gflags/gflags.h"
+#include "yaml-cpp/yaml.h"
+#include "spdlog/spdlog.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+
+// Btr internal includes
+#include "storage/Relation.hpp"
+#include "datablock/schemes/CSchemePool.hpp"
+#include "datablock/Datablock.hpp"
+#include "datablock/cache/ThreadCache.hpp"
+#include "parser/Parser.hpp"
+#include "analysis/Analysis.hpp"
+#include "datablock/BtrReader.hpp"
+#include "utils/Utils.hpp"
+
+
+// Define command line flags
+// TODO make the required flags mandatory/positional
+DEFINE_string(btr, "btr", "Directory for btr output");
+DEFINE_string(binary, "binary", "Directory for binary output");
+DEFINE_string(yaml, "schema.yaml", "Schema in YAML format");
+DEFINE_string(csv, "data.csv", "Original data in CSV format");
+DEFINE_string(stats, "stats.txt", "File where stats are being stored");
+DEFINE_string(compressionout, "compressionout.txt", "File where compression times are being stored");
+DEFINE_string(typefilter, "", "Only include columns with the selected type");
+DEFINE_bool(create_binary, false, "Set if binary files are supposed to be created");
+DEFINE_bool(create_btr, false, "If false will exit after binary creation");
+DEFINE_bool(verify, true, "Verify that decompression works");
+DEFINE_int32(chunk, -1, "Select a specific chunk to measure");
+DEFINE_int32(column, -1, "Select a specific column to measure");
+DEFINE_uint32(threads, 8, "");
+// This one actually does nothing, but without it linking will fail.
+DEFINE_string(fsst_stats, "", "");
+
+void verify_or_die(const std::string& filename, const std::vector<cengine::InputChunk> &input_chunks) {
+   if (!FLAGS_verify) {
+      return;
+   }
+   // Verify that decompression works
+   thread_local std::vector<char> compressed_data;
+   cengine::Utils::readFileToMemory(filename, compressed_data);
+   cengine::db::BtrReader reader(compressed_data.data());
+   for (SIZE chunk_i = 0; chunk_i < reader.getChunkCount(); chunk_i++) {
+      std::vector<u8> output(reader.getDecompressedSize(chunk_i));
+      bool requires_copy = reader.readColumn(output, chunk_i);
+      if (!input_chunks[chunk_i].compareContents(output.data(), reader.getBitmap(chunk_i), reader.getTupleCount(chunk_i),
+                                                 requires_copy)) {
+         throw Generic_Exception("Decompression yields different contents");
+      }
+   }
+}
+
+int main(int argc, char **argv)
+{
+   gflags::ParseCommandLineFlags(&argc, &argv, true);
+   std::string binary_path = FLAGS_binary + "/";
+   // This seems to be necessary before any compression happens.
+   cengine::db::CSchemePool::refresh();
+
+   // Init TBB. TODO: is that actually still necessary?
+   tbb::task_scheduler_init init(FLAGS_threads);
+
+   // Load schema
+   const auto schema = YAML::LoadFile(FLAGS_yaml);
+
+   uint64_t binary_creation_time = 0;
+   if (FLAGS_create_binary) {
+      spdlog::info("Creating binary files in " + FLAGS_binary);
+      // Load and parse CSV
+      auto start_time = std::chrono::steady_clock::now();
+      std::ifstream csv(FLAGS_csv);
+      if (!csv.good()) {
+         throw Generic_Exception("Unable to open specified csv file");
+      }
+      // parse writes the binary files
+      cengine::Parser::parse(FLAGS_csv, schema, FLAGS_binary);
+      auto end_time = std::chrono::steady_clock::now();
+      binary_creation_time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+   }
+
+   if (!FLAGS_create_btr) {
+      return 0;
+   }
+
+   spdlog::info("Creating btr files in " + FLAGS_btr);
+
+
+   ColumnType typefilter;
+   if (FLAGS_typefilter.empty()) {
+      typefilter = ColumnType::UNDEFINED;
+   } else if (FLAGS_typefilter == "integer") {
+      typefilter = ColumnType::INTEGER;
+   } else if (FLAGS_typefilter == "double") {
+      typefilter = ColumnType::DOUBLE;
+   } else if (FLAGS_typefilter == "string") {
+      typefilter = ColumnType::STRING;
+   } else {
+      throw std::runtime_error("typefilter must be one of [integer, double, string]");
+   }
+
+   if (typefilter != ColumnType::UNDEFINED) {
+      spdlog::info("Only considering columns with type " + FLAGS_typefilter);
+   }
+
+   // Create relation
+   cengine::Relation relation = cengine::Relation(schema, FLAGS_binary);
+   std::filesystem::path yaml_path = FLAGS_yaml;
+   relation.name = yaml_path.stem();
+
+   // Prepare datastructures for btr compression
+   //auto ranges = relation.getRanges(static_cast<cengine::SplitStrategy>(1), 9999);
+   auto ranges = relation.getRanges(cengine::SplitStrategy::SEQUENTIAL, 9999);
+   assert(ranges.size() > 0);
+   cengine::db::Datablock datablockV2(relation);
+   std::filesystem::create_directory(FLAGS_btr);
+// if (!std::filesystem::create_directory(FLAGS_btr)) {
+//     throw Generic_Exception("Unable to create btr output directory");
+// }
+
+   // These counters are for statistics that match the harbook.
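+   // Per-column statistics, reduced into FLAGS_stats once compression finishes:
+   // input bytes, output bytes, number of on-disk parts, and the column type.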
+ std::vector sizes_uncompressed(relation.columns.size()); + std::vector sizes_compressed(relation.columns.size()); + std::vector part_counters(relation.columns.size()); + std::vector types(relation.columns.size()); + + // TODO run in parallel over individual columns and handle chunks inside + // TODO collect statistics for overall metadata like + // - total tuple count + // - for every column: total number of parts + // - for every column: name, type + // TODO chunk flag + auto start_time = std::chrono::steady_clock::now(); + tbb::parallel_for(SIZE(0), relation.columns.size(), [&](SIZE column_i) { + types[column_i] = relation.columns[column_i].type; + if (typefilter != ColumnType::UNDEFINED && typefilter != types[column_i]) { + return; + } + if (FLAGS_column != -1 && FLAGS_column != column_i) { + return; + } + + std::vector input_chunks; + std::string path_prefix = FLAGS_btr + "/" + "column" + std::to_string(column_i) + "_part"; + cengine::ColumnPart part; + for (SIZE chunk_i = 0; chunk_i < ranges.size(); chunk_i++) { + if (FLAGS_chunk != -1 && FLAGS_chunk != chunk_i) { + continue; + } + + auto input_chunk = relation.getInputChunk(ranges[chunk_i], chunk_i, column_i); + std::vector data = cengine::db::Datablock::compress(input_chunk); + sizes_uncompressed[column_i] += input_chunk.size; + + if (!part.canAdd(data.size())) { + std::string filename = path_prefix + to_string(part_counters[column_i]); + sizes_compressed[column_i] += part.writeToDisk(filename); + part_counters[column_i]++; + verify_or_die(filename, input_chunks); + input_chunks.clear(); + } + + input_chunks.push_back(std::move(input_chunk)); + part.addCompressedChunk(std::move(data)); + } + + if (!part.chunks.empty()) { + std::string filename = path_prefix + to_string(part_counters[column_i]); + sizes_compressed[column_i] += part.writeToDisk(filename); + part_counters[column_i]++; + verify_or_die(filename, input_chunks); + input_chunks.clear(); + } + }); + + cengine::db::Datablock::writeMetadata(FLAGS_btr + "/metadata", types, part_counters, ranges.size()); + std::ofstream stats_stream(FLAGS_stats); + size_t total_uncompressed = 0; + size_t total_compressed = 0; + stats_stream << "Column,uncompressed,compressed\n"; + for (SIZE col=0; col < relation.columns.size(); col++) { + total_uncompressed += sizes_uncompressed[col]; + total_compressed += sizes_compressed[col]; + + stats_stream << relation.columns[col].name << "," << sizes_uncompressed[col] << "," << sizes_compressed[col] << "\n"; + } + auto end_time = std::chrono::steady_clock::now(); + uint64_t btr_creation_time = std::chrono::duration_cast(end_time - start_time).count(); + + stats_stream << "Total," << total_uncompressed << "," << total_compressed << std::endl; + + std::ofstream compressionout_stream(FLAGS_compressionout); + double binary_creation_time_seconds = static_cast(binary_creation_time) / 1e6; + double btr_creation_time_seconds = static_cast(btr_creation_time) / 1e6; + + compressionout_stream << "binary: " << binary_creation_time_seconds + << " btr: " << btr_creation_time_seconds + << " total: " << (binary_creation_time_seconds + btr_creation_time_seconds) + << " verify: " << FLAGS_verify << std::endl; + return 0; +} diff --git a/benchmarks/analyze_better_blocks/tools/conversion/decompression-speed-s3.cpp b/benchmarks/analyze_better_blocks/tools/conversion/decompression-speed-s3.cpp new file mode 100644 index 0000000..f9837a4 --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/conversion/decompression-speed-s3.cpp @@ -0,0 +1,181 @@ +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" + +#include "datablock/schemes/CSchemePool.hpp" +#include "datablock/BtrReader.hpp" +#include "utils/Utils.hpp" + +#include "s3-management.hpp" + +static std::atomic total_decompressed_size = 0; +static std::atomic chunk_count = 0; +static std::atomic part_count = 0; + +DEFINE_string(region, "", "Region of the S3 bucket"); +DEFINE_string(bucket, "", "Bucket that contains the btr data"); +DEFINE_string(prefix, "", "Prefix within the bucket that contains btr data"); +DEFINE_uint64(reps, 1, "Number of repetitions"); +DEFINE_uint64(prealloc, 1024, "Number of preallocated buffers"); +DEFINE_int32(threads, -1, "Limit threads of decompressor node"); +DEFINE_string(columns, "all", "List of columns to decompress"); + +DEFINE_string(fsst_stats, "", ""); // unused, defined to make linker not break + +std::vector metadata_v; + +static inline const cengine::db::FileMetadata *read_metatdata(const s3_client_t &s3_client) { + s3::Model::GetObjectRequest get_request; + get_request.SetBucket(FLAGS_bucket); + get_request.SetKey(FLAGS_prefix + "/metadata"); + + auto outcome = s3_client.GetObject(get_request); + if (!outcome.IsSuccess()) { + throw std::runtime_error(outcome.GetError().GetMessage()); + } + + metadata_v.resize(outcome.GetResult().GetContentLength()); + outcome.GetResult().GetBody().read(metadata_v.data(), metadata_v.size()); + auto metadata = reinterpret_cast(metadata_v.data()); + + /* This stream is intentionally not release */ + return metadata; +} + +static uint64_t decompressPart(long idx) { + void *buffer = streambufarrays[idx].data(); + + // Open Reader + cengine::db::BtrReader reader(buffer); + + // Decompress content + uint64_t local_decompressed_size = 0; + for (u32 chunk_i = 0; chunk_i < reader.getChunkCount(); chunk_i++) { + thread_local std::vector decompressed_data; + reader.readColumn(decompressed_data, chunk_i); + local_decompressed_size += reader.getDecompressedDataSize(chunk_i); + } + + total_decompressed_size += local_decompressed_size; + chunk_count += reader.getChunkCount(); + part_count += 1; + + s3_decompressPartFinish(idx); + + return local_decompressed_size; +} + +static inline void requestColumn(const s3_client_t& s3_client, const cengine::db::FileMetadata *file_metadata, u32 column) { + for (u32 part_i = 0; part_i < file_metadata->parts[column].num_parts; part_i++) { + std::stringstream key; + key << FLAGS_prefix << "/" << "column" << column << "_part" << part_i; + s3_requestFile(s3_client, key.str()); + } +} + +int main(int argc, char **argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + cengine::db::CSchemePool::refresh(); + + if (FLAGS_region.empty() || FLAGS_bucket.empty()) { + std::cerr << "Region and Bucket are required" << std::endl; + return 1; + } + + if (FLAGS_prealloc == 0) { + std::cerr << "Must prealloc at least 1 buffer" << std::endl; + return 1; + } + + // Prepare S3 API. 
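+   // Aws::InitAPI / Aws::ShutdownAPI must bracket all SDK usage; the extra scope
+   // below ensures the S3 client and its streams are destroyed before ShutdownAPI runs.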
+    // Prepare S3 API.
+    Aws::SDKOptions options;
+    Aws::InitAPI(options);
+    {
+        s3_client_t s3_client = s3_get_client(FLAGS_region);
+
+        // Read the metadata
+        const cengine::db::FileMetadata *file_metadata = read_metadata(s3_client);
+
+        std::vector<u32> columns;
+        if (FLAGS_columns == "all") {
+            columns.resize(file_metadata->num_columns);
+            std::iota(columns.begin(), columns.end(), 0);
+        } else {
+            // Copy the flag into a mutable, NUL-terminated buffer for strtok.
+            std::vector<char> column_string(FLAGS_columns.length() + 1);
+            strcpy(column_string.data(), FLAGS_columns.c_str());
+            char *i_str = strtok(column_string.data(), ",");
+            while (i_str != nullptr) {
+                int i = std::stoi(i_str);
+                if (i < 0 || i >= file_metadata->num_columns) {
+                    std::cerr << "Selected column is out of range. " << i << " not in [0, " << file_metadata->num_columns - 1 << "]" << std::endl;
+                    return 1;
+                }
+                columns.push_back(i);
+                i_str = strtok(nullptr, ",");
+            }
+            if (columns.empty()) {
+                std::cerr << "columns invalid" << std::endl;
+                exit(1);
+            }
+        }
+
+        u32 total_parts = 0;
+        for (u32 column : columns) {
+            total_parts += file_metadata->parts[column].num_parts;
+        }
+
+        std::size_t threads = tbb::flow::unlimited;
+        if (FLAGS_threads > 0) {
+            threads = FLAGS_threads;
+        }
+
+        s3_init(FLAGS_reps * total_parts, FLAGS_prealloc, threads, FLAGS_bucket, decompressPart);
+
+        auto t1 = std::chrono::high_resolution_clock::now();
+        // Start all requests asynchronously
+        for (uint64_t rep = 0; rep < FLAGS_reps; rep++) {
+            for (u32 column : columns) {
+                requestColumn(s3_client, file_metadata, column);
+            }
+        }
+
+        s3_wait_for_end();
+        auto t2 = std::chrono::high_resolution_clock::now();
+
+        auto us = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+        double s = static_cast<double>(us.count()) / static_cast<double>(1e6);
+
+        double total_downloaded_size_gib = static_cast<double>(total_downloaded_size) / static_cast<double>(1 << 30);
+        double total_downloaded_size_gigabits = static_cast<double>(total_downloaded_size * 8) / static_cast<double>(1 << 30);
+        double gbps = total_downloaded_size_gigabits / s;
+
+        double total_decompressed_size_gib = static_cast<double>(total_decompressed_size) / static_cast<double>(1 << 30);
+        double total_decompressed_size_mib = static_cast<double>(total_decompressed_size) / static_cast<double>(1 << 20);
+        double mibs = total_decompressed_size_mib / s;
+
+        // Add requests for metadata
+        total_requests += FLAGS_reps;
+
+        //std::cout << (total_parts * FLAGS_reps) << " " << part_count << " " << chunk_count_2 << " " << chunk_count << " " << total_downloaded_size << " " << total_decompressed_size << std::endl;
+
+        std::cout << "Runtime[s]: " << s
+                  << " Downloaded[GiB]: " << total_downloaded_size_gib
+                  << " Bandwidth[Gbps]: " << gbps
+                  << " Decompressed[GiB]: " << total_decompressed_size_gib
+                  << " Decompression[MiB/s]: " << mibs
+                  << " Requests: " << total_requests
+                  << std::endl;
+
+        s3_free_buffers();
+    }
+    Aws::ShutdownAPI(options);
+}
diff --git a/benchmarks/analyze_better_blocks/tools/conversion/decompression-speed.cpp b/benchmarks/analyze_better_blocks/tools/conversion/decompression-speed.cpp new file mode 100644 index 0000000..50861fa --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/conversion/decompression-speed.cpp @@ -0,0 +1,241 @@
+//
+// Created by david on 27.04.22.
+//
+
+#include
+#include
+#include
+#include
+
+#include "gflags/gflags.h"
+#include "tbb/parallel_for.h"
+#include "tbb/task_scheduler_init.h"
+
+#include "datablock/schemes/CSchemePool.hpp"
+#include "datablock/BtrReader.hpp"
+#include "PerfEvent.hpp"
+#include "utils/Utils.hpp"
+
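Both this tool and the S3 variant above consume the metadata file by reading raw bytes and reinterpreting them in place as a `cengine::db::FileMetadata`. A minimal sketch of that header-cast idiom under a hypothetical layout (the real FileMetadata fields live elsewhere in the tree; `HeaderView` and `viewHeader` are illustrative names):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical layout mirroring how FileMetadata is used in these tools:
// fixed counters followed by a trailing per-column array, read as one blob
// and viewed in place. Valid only while the blob stays alive and only if
// the writer used the same ABI (packing, endianness) as the reader.
struct HeaderView {
    uint32_t num_columns;
    uint32_t num_chunks;
    // PartInfo parts[];  // trailing array, indexed 0 .. num_columns-1
};

static const HeaderView* viewHeader(const std::vector<char>& blob) {
    return reinterpret_cast<const HeaderView*>(blob.data());
}
```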
+DEFINE_string(btr, "btr", "Directory with btr input");
+DEFINE_int32(threads, 1, "Number of threads used. Not specifying lets TBB decide");
+DEFINE_string(fsst_stats, "", ""); // unused, defined to make the linker not break
+DEFINE_int32(column, -1, "Select a specific column to measure");
+DEFINE_string(typefilter, "", "Only measure columns with given type");
+//DEFINE_int32(chunk, -1, "Select a specific chunk to measure");
+DEFINE_uint32(reps, 1, "Loop reps times");
+DEFINE_bool(perfevent, false, "Profile with perf event if true");
+DEFINE_bool(output_summary, false, "Output a summary of total speed and size");
+DEFINE_bool(output_columns, true, "Output speeds and sizes for single columns");
+DEFINE_bool(print_simd_debug, false, "Print SIMD usage debug information");
+
+void reset_bitmaps(const cengine::db::FileMetadata *metadata, std::vector<std::vector<cengine::db::BtrReader>> &readers, std::vector<u32> &columns) {
+    tbb::parallel_for_each(columns, [&](u32 column_i) {
+        tbb::parallel_for(u32(0), metadata->parts[column_i].num_parts, [&](u32 part_i) {
+            auto &reader = readers[column_i][part_i];
+            tbb::parallel_for(u32(0), reader.getChunkCount(), [&](u32 chunk_i) {
+                reader.releaseBitmap(chunk_i);
+            });
+        });
+    });
+}
+
+u64 measure(const cengine::db::FileMetadata *metadata, std::vector<std::vector<cengine::db::BtrReader>> &readers, std::vector<u64> &runtimes, std::vector<u32> &columns) {
+    // Make sure no bitmap is cached
+    reset_bitmaps(metadata, readers, columns);
+
+    auto total_start_time = std::chrono::steady_clock::now();
+
+    tbb::parallel_for_each(columns, [&](u32 column_i) {
+        // TODO the per-column timers below overlap under parallel_for_each,
+        // so only the outer total is a true wall-clock figure
+        auto start_time = std::chrono::steady_clock::now();
+        tbb::parallel_for(u32(0), metadata->parts[column_i].num_parts, [&](u32 part_i) {
+            auto &reader = readers[column_i][part_i];
+            tbb::parallel_for(u32(0), reader.getChunkCount(), [&](u32 chunk_i) {
+                thread_local std::vector<u8> decompressed_data;
+                reader.readColumn(decompressed_data, chunk_i);
+            });
+        });
+        auto end_time = std::chrono::steady_clock::now();
+        auto runtime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
+        runtimes[column_i] += runtime.count();
+    });
+
+    auto total_end_time = std::chrono::steady_clock::now();
+    auto total_runtime = std::chrono::duration_cast<std::chrono::microseconds>(total_end_time - total_start_time);
+    return total_runtime.count();
+}
+
+u64 measure_single_thread(const cengine::db::FileMetadata *metadata, std::vector<std::vector<cengine::db::BtrReader>> &readers, std::vector<u64> &runtimes, std::vector<u32> &columns) {
+    reset_bitmaps(metadata, readers, columns);
+
+    auto total_start_time = std::chrono::steady_clock::now();
+    for (u32 column_i : columns) {
+        for (u32 part_i = 0; part_i < metadata->parts[column_i].num_parts; part_i++) {
+            auto &reader = readers[column_i][part_i];
+            for (u32 chunk_i = 0; chunk_i < reader.getChunkCount(); chunk_i++) {
+                for (u32 rep = 0; rep < FLAGS_reps; rep++) {
+                    thread_local std::vector<u8> decompressed_data;
+                    reader.releaseBitmap(chunk_i);
+                    auto start_time = std::chrono::steady_clock::now();
+                    reader.readColumn(decompressed_data, chunk_i);
+                    auto end_time = std::chrono::steady_clock::now();
+                    auto runtime = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
+                    runtimes[column_i] += runtime.count();
+                }
+            }
+        }
+    }
+    auto total_end_time = std::chrono::steady_clock::now();
+    auto total_runtime = std::chrono::duration_cast<std::chrono::microseconds>(total_end_time - total_start_time);
+    return total_runtime.count();
+}
+
+int main(int argc, char **argv) {
+    // Parse the flags first so FLAGS_print_simd_debug is populated when read.
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
+    if (FLAGS_print_simd_debug) {
+#if BTR_USE_SIMD
+        std::cerr << "using simd" << std::endl;
+#else
+        std::cerr << "NOT using simd" << std::endl;
+#endif
+    }
+    std::filesystem::path btr_dir = FLAGS_btr;
+
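The measurement below runs `measure()` once untimed and zeroes the results before the timed repetitions, so first-touch page faults and `thread_local` buffer growth do not pollute the reported numbers. The same harness shape, reduced to its essentials (a sketch, not code from the patch):

```cpp
#include <chrono>
#include <cstdint>
#include <functional>

// Run `body` once as an untimed warm-up, then time `reps` repetitions
// and return the average duration in microseconds (reps must be >= 1).
static uint64_t timed_avg_us(const std::function<void()>& body, uint32_t reps) {
    body();  // warm-up: allocate buffers, fault in pages, fill caches
    const auto t0 = std::chrono::steady_clock::now();
    for (uint32_t r = 0; r < reps; r++) {
        body();
    }
    const auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / reps;
}
```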
+    cengine::db::CSchemePool::refresh();
+
+    int threads;
+    if (FLAGS_threads < 1) {
+        // Automatic selection
+        threads = -1;
+    } else {
+        threads = FLAGS_threads;
+    }
+    tbb::task_scheduler_init init(threads);
+
+    // Read the metadata
+    std::vector<char> raw_file_metadata;
+    const cengine::db::FileMetadata *file_metadata;
+    {
+        auto metadata_path = btr_dir / "metadata";
+        cengine::Utils::readFileToMemory(metadata_path.string(), raw_file_metadata);
+        file_metadata = reinterpret_cast<const cengine::db::FileMetadata *>(raw_file_metadata.data());
+    }
+
+    // Filter columns
+    ColumnType typefilter;
+    if (FLAGS_typefilter.empty()) {
+        typefilter = ColumnType::UNDEFINED;
+    } else if (FLAGS_typefilter == "integer") {
+        typefilter = ColumnType::INTEGER;
+    } else if (FLAGS_typefilter == "double") {
+        typefilter = ColumnType::DOUBLE;
+    } else if (FLAGS_typefilter == "string") {
+        typefilter = ColumnType::STRING;
+    } else {
+        throw std::runtime_error("typefilter must be one of [integer, double, string]");
+    }
+
+    std::vector<u32> columns;
+    if (FLAGS_column != -1) {
+        if (typefilter != ColumnType::UNDEFINED && file_metadata->parts[FLAGS_column].type != typefilter) {
+            std::cerr << "Type of selected column " << FLAGS_column << " does not match filtered type" << std::endl;
+            exit(EXIT_FAILURE);
+        }
+        columns.push_back(FLAGS_column);
+    } else if (typefilter != ColumnType::UNDEFINED) {
+        for (u32 column = 0; column < file_metadata->num_columns; column++) {
+            if (file_metadata->parts[column].type == typefilter) {
+                columns.push_back(column);
+            }
+        }
+    } else {
+        columns.resize(file_metadata->num_columns);
+        std::iota(columns.begin(), columns.end(), 0);
+    }
+
+    std::cerr << "Decompressing columns: ";
+    for (u32 column : columns) {
+        std::cerr << column << " ";
+    }
+    std::cerr << std::endl;
+
+    // Prepare the readers
+    std::vector<std::vector<cengine::db::BtrReader>> readers(file_metadata->num_columns);
+    std::vector<std::vector<std::vector<char>>> compressed_data(file_metadata->num_columns);
+    tbb::parallel_for_each(columns, [&](u32 column_i) {
+        compressed_data[column_i].resize(file_metadata->parts[column_i].num_parts);
+        for (u32 part_i = 0; part_i < file_metadata->parts[column_i].num_parts; part_i++) {
+            auto path = btr_dir / ("column" + std::to_string(column_i) + "_part" + std::to_string(part_i));
+            cengine::Utils::readFileToMemory(path.string(), compressed_data[column_i][part_i]);
+            readers[column_i].emplace_back(compressed_data[column_i][part_i].data());
+        }
+    });
+    std::vector<u64> runtimes(file_metadata->num_columns);
+
+    // Measure once to make sure all buffers are allocated properly
+    measure(file_metadata, readers, runtimes, columns);
+    std::fill(runtimes.begin(), runtimes.end(), 0);
+
+    u64 total_runtime = 0;
+    // Actual measurement
+    if (false) {
+        total_runtime = measure_single_thread(file_metadata, readers, runtimes, columns);
+    } else {
+        for (u32 rep = 0; rep < FLAGS_reps; rep++) {
+            total_runtime += measure(file_metadata, readers, runtimes, columns);
+        }
+    }
+
+    // Collect sizes
+    std::vector<size_t> decompressed_sizes(file_metadata->num_columns, 0);
+    std::vector<size_t> compressed_sizes(file_metadata->num_columns, 0);
+    size_t total_size = 0;
+    size_t total_compressed_size = 0;
+    for (u32 column_i : columns) {
+        for (u32 part_i = 0; part_i < file_metadata->parts[column_i].num_parts; part_i++) {
+            cengine::db::BtrReader &reader = readers[column_i][part_i];
+            compressed_sizes[column_i] += compressed_data[column_i][part_i].size();
+            for (u32 chunk_i = 0; chunk_i < reader.getChunkCount(); chunk_i++) {
+                size_t s = reader.getDecompressedDataSize(chunk_i);
+                decompressed_sizes[column_i] += s;
+                total_size += s;
+            }
+        }
+        total_compressed_size += compressed_sizes[column_i];
+    }
+
+    if (FLAGS_output_columns) {
+        for (u32 column_i : columns) {
+            double average_runtime = static_cast<double>(runtimes[column_i]) / static_cast<double>(FLAGS_reps);
+            double mb = static_cast<double>(decompressed_sizes[column_i]) / (1024.0 * 1024.0);
+            double s = average_runtime / (1000.0 * 1000.0);
+            double mbs = mb / s;
+
+            cengine::db::BtrReader &reader = readers[column_i][0];
+            double size_per_chunk = static_cast<double>(decompressed_sizes[column_i]) / static_cast<double>(file_metadata->num_chunks);
+            std::cout << std::to_string(column_i)
+                      << " " << ConvertTypeToString(reader.getColumnType())
+                      << " " << reader.getBasicSchemeDescription(0)
+                      << " " << compressed_sizes[column_i] << " Bytes"
+                      << " " << decompressed_sizes[column_i] << " Bytes"
+                      << " " << size_per_chunk << " Bytes"
+                      << " " << average_runtime << " us"
+                      << " " << mbs << " MB/s"
+                      << std::endl;
+        }
+    }
+
+    if (FLAGS_output_summary) {
+        double average_runtime = static_cast<double>(total_runtime) / static_cast<double>(FLAGS_reps);
+        double mb = static_cast<double>(total_size) / (1024.0 * 1024.0);
+        double s = average_runtime / (1000.0 * 1000.0);
+        double mbs = mb / s;
+
+        std::cout << "Total:"
+                  << " " << total_compressed_size << " Bytes"
+                  << " " << total_size << " Bytes"
+                  << " " << average_runtime << " us"
+                  << " " << mbs << " MB/s"
+                  << std::endl;
+    }
+}
diff --git a/benchmarks/analyze_better_blocks/tools/conversion/s3-management.hpp b/benchmarks/analyze_better_blocks/tools/conversion/s3-management.hpp new file mode 100644 index 0000000..f3d4ab2 --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/conversion/s3-management.hpp @@ -0,0 +1,207 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace s3 = Aws::S3Crt;
+using s3_client_t = s3::S3CrtClient;
+
+static const char *allocation_tag = "btr";
+std::string s3_bucket;
+std::string s3_region;
+
+// These counters are here for debugging
+static std::atomic<long> allocated = 0;
+static std::atomic<long> releasedStreams = 0;
+static std::atomic<long> releasedBuffers = 0;
+
+std::vector<s3::Model::GetObjectRequest> get_requests;
+static long num_preallocated_buffers; // We may be able to tune this down.
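The globals below implement a blocking pool of preallocated download buffers, identified by index: `s3_getStream` checks an index out, and the decompressor returns it when finished. The same pattern in isolation (a sketch with hypothetical names, not the patch's own type):

```cpp
#include <condition_variable>
#include <mutex>
#include <queue>

// Blocking checkout/return pool over buffer indices: acquire() waits
// until an index is free; release() returns one and wakes a waiter.
class IndexPool {
public:
    explicit IndexPool(long n) {
        for (long i = 0; i < n; i++) free_.push(i);
    }
    long acquire() {
        std::unique_lock<std::mutex> lock(mutex_);
        cv_.wait(lock, [this] { return !free_.empty(); });
        const long idx = free_.front();
        free_.pop();
        return idx;
    }
    void release(long idx) {
        {
            std::lock_guard<std::mutex> guard(mutex_);
            free_.push(idx);
        }
        cv_.notify_one();
    }
private:
    std::mutex mutex_;
    std::condition_variable cv_;
    std::queue<long> free_;
};
```

Completed downloads are then handed to a `tbb::flow::function_node` via `try_put`, which is what bounds decompression concurrency to `node_limit` while the pool bounds memory.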
+static const long part_size = 16 * 1024 * 1024; // 16 MiB
+static Aws::IOStreamFactory response_stream_factory;
+static std::vector<std::vector<unsigned char>> streambufarrays;
+static std::vector<uint64_t> streambuflens;
+static std::vector<Aws::Utils::Stream::PreallocatedStreamBuf *> streambufs;
+static std::mutex buffer_mutex;
+static std::condition_variable buffer_cv;
+static std::queue<long> buffers_available;
+static tbb::concurrent_unordered_map<Aws::IOStream *, long> occupied_map;
+
+static std::atomic<uint64_t> total_downloaded_size = 0;
+static std::atomic<uint64_t> total_requests = 0;
+
+std::mutex mutex;
+std::condition_variable condition_variable;
+long remaining_results;
+std::function<void(const s3_client_t *, const s3::Model::GetObjectRequest &, s3::Model::GetObjectOutcome, const std::shared_ptr<const Aws::Client::AsyncCallerContext> &)> callback;
+size_t next_index = 0;
+
+tbb::flow::graph g;
+size_t node_limit = 72;
+tbb::flow::function_node<long, uint64_t> *decompressor = nullptr;
+
+static void s3_free_buffers() {
+    for (auto ptr : streambufs) {
+        Aws::Delete(ptr);
+    }
+}
+
+static void s3_prepare_buffers(long prealloc) {
+    num_preallocated_buffers = prealloc;
+    for (long i = 0; i < num_preallocated_buffers; i++) {
+        streambufarrays.emplace_back(part_size * 2);
+        /* Touch every page to avoid faults later */
+        for (long pos = 0; pos < streambufarrays[i].size(); pos++) {
+            streambufarrays[i][pos] = 0;
+        }
+        streambufs.emplace_back(Aws::New<Aws::Utils::Stream::PreallocatedStreamBuf>(allocation_tag, streambufarrays[i].data(), static_cast<uint64_t>(streambufarrays[i].size())));
+        streambuflens.push_back(0);
+        // TODO we never call delete, but for now that's also not important
+        buffers_available.push(i);
+    }
+    std::cerr << "Prepared " << num_preallocated_buffers << " buffers." << std::endl;
+}
+
+inline Aws::IOStream *s3_getStream() {
+    long idx;
+    {
+        std::unique_lock<std::mutex> lock(buffer_mutex);
+        buffer_cv.wait(lock, [&] {
+            return !buffers_available.empty();
+        });
+
+        idx = buffers_available.front();
+        buffers_available.pop();
+    }
+    // The SDK will call delete on the stream after the callback for GetObject finishes
+    Aws::IOStream *result = Aws::New<Aws::IOStream>(allocation_tag, streambufs[idx]);
+    occupied_map[result] = idx;
+    allocated++;
+    return result;
+}
+
+inline long s3_releaseStream(Aws::IOStream *stream) {
+    releasedStreams++;
+    return occupied_map[stream];
+}
+
+inline void s3_releaseBuffer(long idx) {
+    // No idea why, but we cannot reuse the PreallocatedStreamBufs. Even seeking back to 0 manually does not help.
+    // If we don't recreate the object, it will read at random positions on second use.
+    // We also cannot delete the streambuf here: the destructor of the GetObjectResult may still try to operate on it.
+    // Effectively we leak memory here, but for the purpose of this benchmark this is the best we can do.
+    streambufs[idx] = Aws::New<Aws::Utils::Stream::PreallocatedStreamBuf>(allocation_tag, streambufarrays[idx].data(), static_cast<uint64_t>(streambufarrays[idx].size()));
+    {
+        std::lock_guard<std::mutex> guard(buffer_mutex);
+        buffers_available.push(idx);
+    }
+    buffer_cv.notify_one();
+    releasedBuffers++;
+}
+
+void s3_GetObjectResponseReceiveHandler(
+        const s3_client_t *,
+        const s3::Model::GetObjectRequest &,
+        s3::Model::GetObjectOutcome outcome,
+        const std::shared_ptr<const Aws::Client::AsyncCallerContext> &) {
+    /*
+     * Called once the request finishes.
+     * There is only one instance of this function running at any time. Therefore we pass the downloaded object on to
+     * a tbb construct that can actually process multiple of them in parallel.
+ */ + if (!outcome.IsSuccess()) { + throw std::runtime_error(outcome.GetError().GetMessage()); + } + + unsigned long length = outcome.GetResult().GetContentLength(); + total_downloaded_size += length; + unsigned long num_request = (length + part_size - 1) / part_size; + total_requests += num_request; + + // Insert pointer into node + auto stream_ptr = &(outcome.GetResult().GetBody()); + long idx = s3_releaseStream(stream_ptr); + streambuflens[idx] = length; + decompressor->try_put(idx); +} + +inline void s3_requestFile(const s3_client_t &s3_client, const std::string &key) { + auto ¤t_request = get_requests[next_index++]; + current_request.SetBucket(s3_bucket); + current_request.SetKey(key); + current_request.SetResponseStreamFactory(response_stream_factory); + + s3_client.GetObjectAsync(current_request, callback); +} + +inline void s3_decompressPartFinish(long idx) { + s3_releaseBuffer(idx); + long remaining; + { + std::lock_guard guard(mutex); + remaining = --remaining_results; + } + + if (remaining == 0) { + condition_variable.notify_one(); + } +} + +inline void s3_wait_for_end() { + // Wait until all requests are actually finished + { + std::unique_lock lock(mutex); + condition_variable.wait(lock, []{return remaining_results == 0;}); + } + // Just to make sure + g.wait_for_all(); +} + +inline void s3_init(std::size_t total_requests, long prealloc, std::size_t threads, std::string &bucket, uint64_t (*decompressPart)(long)) { + s3_prepare_buffers(prealloc); + + callback = s3_GetObjectResponseReceiveHandler; + response_stream_factory = s3_getStream; + + get_requests.resize(total_requests); + remaining_results = get_requests.size(); + + node_limit = threads; + decompressor = new tbb::flow::function_node(g, node_limit, decompressPart); + + s3_bucket = bucket; +} + +inline s3_client_t s3_get_client(std::string ®ion) { + s3_region = region; + + s3::ClientConfiguration config; + config.partSize = part_size; + config.throughputTargetGbps = 100.0; // Throughput target for c5n.18xlarge + config.region = s3_region; + // Tested dual stack once, but did not see any improvement in performance. 
+ //config.useDualStack = true; + config.scheme = Aws::Http::Scheme::HTTP; + return s3_client_t(config); +} diff --git a/benchmarks/analyze_better_blocks/tools/datasets/.gitignore b/benchmarks/analyze_better_blocks/tools/datasets/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/datasets/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/prepare_dataset.sh b/benchmarks/analyze_better_blocks/tools/datasets/prepare_dataset.sh new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/datasets/stats/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DecimalApplication.cpp b/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DecimalApplication.cpp new file mode 100644 index 0000000..c8805eb --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DecimalApplication.cpp @@ -0,0 +1,119 @@ +#include "Units.hpp" +#include "MMapvector.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +DEFINE_bool(print_header, false, "."); +DEFINE_string(in, "", "."); +DEFINE_uint32(block_size, 65000, "."); +DEFINE_uint32(siginifcant_digit_bits_limit, 32, ""); +DEFINE_uint32(exponent_limit, 15, ""); +DEFINE_bool(eps,false,""); +using namespace std; +// ------------------------------------------------------------------------------------- +void printDouble(double input) +{ + union { + double d; + uint64_t u; + }; + + d = input; + bool sign = (u >> 63) & 0x1; + uint64_t exponent = (u >> 52) & 0x7FF; + uint64_t mantissa = u & 0xFFFFFFFFFFFFF; + + cout << sign << " " << bitset<11>(exponent) << " " << bitset<52>(mantissa) << " " + << std::setprecision(17) << d << endl; +} +// ------------------------------------------------------------------------------------- +int main(int argc, char **argv) +{ + srand(time(NULL)); + // ------------------------------------------------------------------------------------- + gflags::ParseCommandLineFlags(&argc, &argv, true); + // ------------------------------------------------------------------------------------- + string data_file = FLAGS_in, bitmap_file; + { + std::regex re("(.*).double"); + std::smatch match; + if ( std::regex_search(data_file, match, re) && match.size() > 1 ) { + bitmap_file = match.str(1) + ".bitmap"; + } + } + // ------------------------------------------------------------------------------------- + Vector column; + column.readBinary(data_file.c_str()); + Vector bitmap; + bitmap.readBinary(bitmap_file.c_str()); + auto column_count = bitmap.size(); + u32 column_set_count = 0; + assert(bitmap.size() == column.size()); + // ------------------------------------------------------------------------------------- + u32 
positive = 0, negative = 0, blocks_count = 0, positive_blocks = 0; + for ( u64 offset = 0; offset < column_count; offset += FLAGS_block_size ) { + blocks_count++; + // ------------------------------------------------------------------------------------- + u64 chunk_tuple_count; + if ( offset + FLAGS_block_size >= column_count ) { + chunk_tuple_count = column_count - offset; + } else { + chunk_tuple_count = FLAGS_block_size; + } + bool block_flag = true; + DOUBLE d; + for ( u32 tuple_i = 0; tuple_i < chunk_tuple_count; tuple_i++ ) { + if ( !bitmap[offset + tuple_i] ) { + continue; + } + column_set_count++; + d = column[offset + tuple_i]; + bool flag = false; + for ( s32 e = 0; e <= FLAGS_exponent_limit; e++ ) { + double cd = d * std::pow(10, e); + u64 significant_digits = std::round(cd); + double rd = significant_digits; + double if_converted_back = std::pow(10, -e) * rd; + // ------------------------------------------------------------------------------------- + // string method + stringstream ss; + if ( e && significant_digits ) { + u32 left = (significant_digits / (std::pow(10, e))); + u32 right = (significant_digits % CU(std::pow(10, e))); + ss << left << "."; + ss << setfill('0') << setw(e) << right; + } + string str; + ss >> str; + if ( str.size()) + if_converted_back = stod(str); + // ------------------------------------------------------------------------------------- + if ( if_converted_back == d ) { + flag = true; + positive++; + cout << significant_digits << '\t' << e << endl; + break; + } + } + if ( !flag ) { + negative++; + printDouble(d); + block_flag = false; + } + } + if ( block_flag ) + positive_blocks++; + } + + cout << fixed << setprecision(4) << 100.0 * CD(positive) / CD(column_set_count) << '\t' + << positive_blocks + << endl; +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DoubleStatsExec.cpp b/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DoubleStatsExec.cpp new file mode 100644 index 0000000..427a83e --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/datasets/stats/double-stats/DoubleStatsExec.cpp @@ -0,0 +1,113 @@ +#include "Units.hpp" +#include "Reinterpret.hpp" +#include "MMapvector.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +DEFINE_bool(print_header, false, "."); +DEFINE_string(in, "", "."); +DEFINE_uint32(block_size, 65000, "."); +using namespace std; +// ------------------------------------------------------------------------------------- +void printDouble(double input) +{ + union { + double d; + uint64_t u; + }; + + d = input; + bool sign = (u >> 63) & 0x1; + uint64_t exponent = (u >> 52) & 0x7FF; + uint64_t mantissa = u & 0xFFFFFFFFFFFFF; + + cout << sign << " " << bitset<11>(exponent) << " " << bitset<52>(mantissa) << " " + << std::setprecision(17) << d << endl; +} +// ------------------------------------------------------------------------------------- +int main(int argc, char **argv) +{ + srand(time(NULL)); + // ------------------------------------------------------------------------------------- + gflags::ParseCommandLineFlags(&argc, &argv, true); + // 
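The inner loop of DecimalApplication above asks whether a double is exactly a decimal with at most `exponent_limit` fractional digits by scaling, rounding, and converting back. The numeric half of that probe, isolated as a standalone predicate (a sketch with names of my choosing; the tool additionally cross-checks via string formatting):

```cpp
#include <cmath>

// Is d exactly representable as (some integer) * 10^-e for an e in
// [0, max_e]? Mirrors the scale-round-convert-back probe above.
static bool decimal_representable(double d, int max_e, int *found_e) {
    for (int e = 0; e <= max_e; e++) {
        const double scaled = d * std::pow(10.0, e);
        const double rounded = std::round(scaled);
        if (std::pow(10.0, -e) * rounded == d) {
            if (found_e != nullptr) *found_e = e;
            return true;
        }
    }
    return false;
}
```

For example, 0.25 passes with e = 2 (25 * 10^-2), while a repeating binary fraction such as 1.0/3.0 is rejected for every e within the tool's default exponent limit.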
------------------------------------------------------------------------------------- + string data_file = FLAGS_in, bitmap_file; + { + std::regex re("(.*).double"); + std::smatch match; + if ( std::regex_search(data_file, match, re) && match.size() > 1 ) { + bitmap_file = match.str(1) + ".bitmap"; + } + } + // ------------------------------------------------------------------------------------- + Vector column; + column.readBinary(data_file.c_str()); + Vector bitmap; + bitmap.readBinary(bitmap_file.c_str()); + auto column_count = bitmap.size(); + u32 column_set_count = 0; + assert(bitmap.size() == column.size()); + // ------------------------------------------------------------------------------------- + // Looking for : positive_blocks(where all exponents are equal); range,min,max of exponents + u32 blocks_count = 0, positive_blocks = 0; + set exponents; + set mantissas; + set doubles; + for ( u64 offset = 0; offset < column_count; offset += FLAGS_block_size ) { + blocks_count++; + // ------------------------------------------------------------------------------------- + u64 chunk_tuple_count; + if ( offset + FLAGS_block_size >= column_count ) { + chunk_tuple_count = column_count - offset; + } else { + chunk_tuple_count = FLAGS_block_size; + } + bool block_flag = true; + bool init = false; + DOUBLE d; + s64 block_exponent = 0; + for ( u32 tuple_i = 0; tuple_i < chunk_tuple_count; tuple_i++ ) { + if ( !bitmap[offset + tuple_i] ) { + continue; + } + column_set_count++; + d = column[offset + tuple_i]; + doubles.insert(d); + s64 current_exponent = (((RU64(d)) >> 52) & 0x7FF) - 1023; + mantissas.insert(RU64(d) & 0xFFFFFFFFFFFFF); + exponents.insert(current_exponent); + if ( !init ) { + init = true; + block_exponent = current_exponent; + continue; + } else { + if ( block_flag && current_exponent != block_exponent ) { + block_flag = false; + } + } + } + if ( block_flag ) { + positive_blocks++; + } + } + + cout << fixed << setprecision(4); + cout << '"' << FLAGS_in << '"' << '\t' + << column_count << '\t' + << column_set_count << '\t' + << 100.0 * CD(positive_blocks) / CD(blocks_count) << '\t' + << *std::max_element(exponents.begin(),exponents.end()) << '\t' + << *std::min_element(exponents.begin(),exponents.end()) << '\t' + << doubles.size() << '\t' + << exponents.size() << '\t' + << mantissas.size() << '\t' + << endl; +} +// ------------------------------------------------------------------------------------- \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/double_stats.py b/benchmarks/analyze_better_blocks/tools/datasets/stats/double_stats.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_double.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_double.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_doubles.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_doubles.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_integer.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_integer.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_integers.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_integers.ods new file mode 100644 index 0000000..e69de29 diff --git 
a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string_stats.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string_stats.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string_tmp.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/all_string_tmp.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/brute_force_results.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/brute_force_results.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_doubles.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_doubles.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_integers.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_integers.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_strings.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/bzip_strings.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/compression_ratio_analysis.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/compression_ratio_analysis.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/dict_before_after_tzt.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/dict_before_after_tzt.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/dict_sharing_compression_ratio_2.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/dict_sharing_compression_ratio_2.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/double_columns_count b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/double_columns_count new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/integer_columns_count b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/integer_columns_count new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/integers_generico.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/integers_generico.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/integers_generico.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/integers_generico.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/lz4_double.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/lz4_double.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/lz4_integer.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/lz4_integer.csv new file mode 100644 index 0000000..e69de29 diff --git 
a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/raw/db1_integers.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/raw/db1_integers.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/raw/db2_integers.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/raw/db2_integers.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/string_columns_count b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/string_columns_count new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/strings_size.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/strings_size.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/vardict_8_16_with_without_tzt.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/vardict_8_16_with_without_tzt.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/xz.csv b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/xz.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/final/xz.ods b/benchmarks/analyze_better_blocks/tools/datasets/stats/final/xz.ods new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/integer-stats/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/datasets/stats/integer-stats/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/integer-stats/IntegerStats.cpp b/benchmarks/analyze_better_blocks/tools/datasets/stats/integer-stats/IntegerStats.cpp new file mode 100644 index 0000000..5f1dd89 --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/datasets/stats/integer-stats/IntegerStats.cpp @@ -0,0 +1,196 @@ +#include "Units.hpp" +#include "MMapvector.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +DEFINE_bool(print_header, false, "."); +DEFINE_string(in, "", "."); +DEFINE_string(out_csv, "", "."); +DEFINE_string(delimiter, "\t", "."); +DEFINE_uint32(max_char_per_cell, 100, "."); +DEFINE_uint32(block_print_length, 20, "."); +DEFINE_uint32(block_count, 3, "."); +DEFINE_uint32(block_length, 65000, "."); +using namespace std; +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +map analyzeBlock(Vector &column, Vector &bitmap, u32 start_index, u32 tuple_count, bool print_block = false) +{ + map stats; + stats["random_element"] = to_string(column[start_index]); + INTEGER min, max; + bool is_starting_values_init = false; + u32 null_count = 0; + u32 zero_count = 0; + std::unordered_map frequency; + + for ( u32 tuple_i = start_index; tuple_i < start_index + tuple_count; tuple_i++ ) { + BITMAP is_set = bitmap.data[tuple_i]; + + if ( !is_set ) { + null_count++; + continue; + } + + auto current_value = column[tuple_i]; + if ( current_value == 0 ) { + 
zero_count++; + } + + if ( frequency.find(current_value) == frequency.end()) { + frequency.insert({current_value, 1}); + } else { + frequency[current_value] = frequency[current_value] + 1; + } + + if(is_starting_values_init) { + if ( current_value > max ) + max = current_value; + if ( current_value < min) + min = current_value; + } else { + is_starting_values_init = true; + min = max = current_value; + } + + } + const u32 set_count = tuple_count - null_count; + const u32 unique_count = frequency.size(); + { + using Comparator = function, pair)>; + // Defining a lambda function to compare two pairs. It will compare two pairs using second field + Comparator compFunctor = + [](pair elem1, pair elem2) { + return elem1.second > elem2.second; + }; + // Declaring a set that will store the pairs using above comparision logic + set, Comparator> frequency_set(frequency.begin(), frequency.end(), compFunctor); + u32 top_i = 1; + for ( const auto &element: frequency_set ) { + INTEGER value = element.first; + double frequency = static_cast(element.second) * 100.0 / static_cast(set_count); + string key_prefix = "top_" + to_string(top_i); + string value_key = key_prefix + "_value"; + string percent_key = key_prefix + "_percent"; + + stats[value_key] = to_string(value); + stats[percent_key] = to_string(frequency); + if ( top_i++ == ((unique_count >= 3) ? 3 : unique_count)) { + break; + } + } + for ( ; top_i <= 3; top_i++ ) { + string key_prefix = "top_" + to_string(top_i); + string value_key = key_prefix + "_value"; + string percent_key = key_prefix + "_percent"; + + stats[value_key] = ""; + stats[percent_key] = ""; + } + } + + stats["min"] = to_string(min); + stats["max"] = to_string(max); + stats["null_count"] = to_string(null_count); + stats["zero_count"] = to_string(zero_count); + stats["unique_count"] = to_string(unique_count); + // ------------------------------------------------------------------------------------- + if ( print_block ) { + string block_rep = ""; + for ( u32 tuple_i = start_index + 1; tuple_i < start_index + FLAGS_block_print_length; tuple_i++ ) { + BITMAP is_set = bitmap.data[tuple_i]; + if ( !is_set ) { + block_rep += "N"; + } else if ( column[tuple_i] == column[tuple_i - 1] ) { + block_rep += "."; + } else { + block_rep += "x"; + } + } + stats["block"] = block_rep; + } + // ------------------------------------------------------------------------------------- + return stats; +} +int main(int argc, char **argv) +{ + srand(time(NULL)); + // ------------------------------------------------------------------------------------- + gflags::ParseCommandLineFlags(&argc, &argv, true); + // ------------------------------------------------------------------------------------- + assert(FLAGS_out_csv.size()); + string data_file = FLAGS_in, bitmap_file; + { + std::regex re("(.*).integer"); + std::smatch match; + if ( std::regex_search(data_file, match, re) && match.size() > 1 ) { + bitmap_file = match.str(1) + ".bitmap"; + } + } + // ------------------------------------------------------------------------------------- + Vector column; + column.readBinary(data_file.c_str()); + Vector bitmap; + bitmap.readBinary(bitmap_file.c_str()); + auto tuple_count = bitmap.size(); + assert(bitmap.size() == column.size()); + // ------------------------------------------------------------------------------------- + map stats; + stats["col_count"] = to_string(column.size()); + { + std::regex re("(\\/[^\\/]+\\/[^\\/]+).integer"); + std::smatch match; + if ( std::regex_search(data_file, match, re) && match.size() > 
1 ) { + stats["col_id"] = '"' + match.str(1) + '"'; + } + } + // ------------------------------------------------------------------------------------- + auto whole_column = analyzeBlock(column, bitmap, 0, tuple_count); + for ( const auto &element: whole_column ) { + stats["col_" + element.first] = element.second; + } + // ------------------------------------------------------------------------------------- + for ( u32 block_i = 1; block_i <= FLAGS_block_count; block_i++ ) { + u32 start_index = rand() % tuple_count; + u32 block_length = std::min(FLAGS_block_length, static_cast(tuple_count - start_index)); + auto block = analyzeBlock(column, bitmap, start_index, block_length, true); + for ( const auto &element: block ) { + stats["block_" + to_string(block_i) + "_" + element.first] = element.second; + } + } + // ------------------------------------------------------------------------------------- + + std::ofstream csv; + csv.open(FLAGS_out_csv, std::ofstream::out | std::ofstream::app); + assert(csv.good()); + if ( csv.tellp() == 0 ) { + for ( auto it = stats.begin(); it != stats.end(); ) { + csv << it->first; + if ( ++it != stats.end()) { + csv << FLAGS_delimiter; + } + } + csv << endl; + } + for ( auto it = stats.begin(); it != stats.end(); ) { + auto sub_str = it->second.substr(0, FLAGS_max_char_per_cell); + std::regex tabs_regex("\\t"); + std::regex nl_regex("\\n"); + auto sterilized_value = std::regex_replace(sub_str, tabs_regex, " "); + sterilized_value = std::regex_replace(sterilized_value, nl_regex, " "); + + csv << sterilized_value; + if ( ++it != stats.end()) { + csv << FLAGS_delimiter; + } + } + csv << endl; + // ------------------------------------------------------------------------------------- + return 0; +} +// ------------------------------------------------------------------------------------- diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/integer_stats.py b/benchmarks/analyze_better_blocks/tools/datasets/stats/integer_stats.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringFSST.cpp b/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringFSST.cpp new file mode 100644 index 0000000..3166b7b --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringFSST.cpp @@ -0,0 +1,106 @@ +#include "Units.hpp" +#include +#include +#include +#include +#include /* srand, rand */ +#include +#include /* time */ +#include +#include +#include +#include "MMapvector.hpp" +#include "fsst.h" + +#include "gflags/gflags.h" +DEFINE_uint32(n, 10, ""); +DEFINE_uint32(s, 100, ""); +DEFINE_uint32(e, 26, ""); +DEFINE_string(in, "", ""); + +using namespace std; +int main(int argc, char **argv) +{ + srand(time(NULL)); + gflags::SetUsageMessage(""); + gflags::ParseCommandLineFlags(&argc, &argv, true); + // ------------------------------------------------------------------------------------- + set unique_strings; + Vector input_strings; + + string bitmap_file; + { + std::regex re("(.*).string"); + std::smatch match; + if ( std::regex_search(FLAGS_in, match, re) && match.size() > 1 ) { + bitmap_file = match.str(1) + ".bitmap"; + } + } + Vector bitmap; + bitmap.readBinary(bitmap_file.c_str()); + input_strings.readBinary(FLAGS_in.c_str()); + 
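StringFSST below measures only the compression side. For orientation, here is a full round trip through the fsst.h C API, assuming the signatures from the reference FSST implementation this benchmark links against (if the vendored copy differs, adjust accordingly):

```cpp
#include <cassert>
#include <cstring>
#include "fsst.h"

// Compress one string and decompress it again. The encoder is trained on
// the input itself here; real use trains on a sample, then reuses the
// encoder for the whole column.
static void fsst_roundtrip_demo() {
    const char *text = "tumcwitumvldb tumcwitumvldb";
    size_t srcLen[1] = {strlen(text)};
    unsigned char *srcBuf[1] = {(unsigned char *)text};

    fsst_encoder_t *encoder = fsst_create(1, srcLen, srcBuf, /*zeroTerminated=*/0);

    unsigned char out[4096];
    size_t dstLen[1];
    unsigned char *dstBuf[1];
    size_t n = fsst_compress(encoder, 1, srcLen, srcBuf, sizeof(out), out, dstLen, dstBuf);
    assert(n == 1);  // all strings fit into the output buffer

    fsst_decoder_t decoder = fsst_decoder(encoder);  // in-memory twin of fsst_export/fsst_import
    unsigned char back[4096];
    size_t m = fsst_decompress(&decoder, dstLen[0], dstBuf[0], sizeof(back), back);
    assert(m == srcLen[0] && memcmp(back, text, m) == 0);

    fsst_destroy(encoder);
}
```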
+    // -------------------------------------------------------------------------------------
+    // extract unique values
+    u32 before_size = 0;
+    u32 start_index = rand() % input_strings.size();
+    u32 tuple_count = std::min(static_cast<unsigned long>(input_strings.size() - start_index), 65000ul);
+    if ( start_index == input_strings.size()) {
+        start_index = 0;
+        tuple_count = std::min(input_strings.size(), 65000ul);
+    }
+    for ( u32 tuple_i = start_index; tuple_i < start_index + tuple_count; tuple_i++ ) {
+        if ( bitmap[tuple_i] == 0 ) {
+            continue;
+        }
+        auto current_str = input_strings[tuple_i];
+        if ( unique_strings.find(current_str) == unique_strings.end()) {
+            before_size += current_str.length();
+            unique_strings.insert(current_str);
+        }
+    }
+    // -------------------------------------------------------------------------------------
+    unsigned long n = unique_strings.size();
+    if ( n == 0 ) {
+        cout << "--" << "\t" << "--" << "\t" << "--" << "\t" << "--" << "\t" << "--" << "\t" << FLAGS_in << endl;
+        return 0;
+    }
+    u8 **srcBuf = (u8 **) calloc(n, sizeof(u8 *));
+    u8 **dstBuf = (u8 **) calloc(n, sizeof(u8 *));
+    u64 *srcLen = (u64 *) calloc(n, sizeof(u64));
+    u64 *dstLen = (u64 *) calloc(n, sizeof(u64));
+    // -------------------------------------------------------------------------------------
+    auto unique_strings_buffers = std::unique_ptr<u8[]>(new u8[before_size]);
+    u8 *write_ptr = unique_strings_buffers.get();
+    u32 i = 0;
+    for ( const auto &unique_str : unique_strings ) {
+        srcBuf[i] = write_ptr;
+        srcLen[i] = unique_str.size();
+        memcpy(write_ptr, unique_str.data(), unique_str.size());
+        write_ptr += unique_str.size();
+        i++;
+    }
+    // -------------------------------------------------------------------------------------
+    unsigned char serialized_encoder_buf[FSST_MAXHEADER];
+    fsst_encoder_t *encoder = fsst_create(n, srcLen, srcBuf, 0);
+    unsigned long hdr = fsst_export(encoder, serialized_encoder_buf);
+
+    unsigned long output_buffer_size = 7 + 4 * before_size; // 1024 * 1024 * 1024
+    auto output_buffer = (u8 *) calloc(output_buffer_size, sizeof(u8));
+
+    auto n_compressed_strings = fsst_compress(encoder, n, srcLen, srcBuf, output_buffer_size, output_buffer,
+                                              dstLen, dstBuf);
+    assert(n_compressed_strings == n);
+    u32 after_size = hdr;
+    for ( u32 tuple_i = 0; tuple_i < n; tuple_i++ ) {
+        after_size += dstLen[tuple_i];
+    }
+//    cout << "n" << "\t" << "before_size" << "\t" << "after_size" << "\t" << "fsst_compression_ratio" << "\t" << "hdr" << "\t" << "path" << "\t" << endl;
+    cout << n << "\t" << before_size << "\t" << after_size << "\t" << static_cast<double>(before_size) / static_cast<double>(after_size) << "\t" << hdr << "\t" << FLAGS_in << endl;
+    fsst_destroy(encoder);
+    return 0;
+}
+/*
+ * Notes:
+ * 1- once you pass an array of input pointers, they should not be equal, otherwise srcLen will be equal to dstLen
+ *
+ */
diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringSharing.cpp b/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringSharing.cpp new file mode 100644 index 0000000..18227d4 --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringSharing.cpp @@ -0,0 +1,92 @@
+#include "Units.hpp"
+#include "MMapvector.hpp"
+#include "Exceptions.hpp"
+#include "parser/Parser.hpp"
+#include "storage/Relation.hpp"
+#include "datablock/Datablock.hpp"
+#include "datablock/CMachine.hpp"
+#include "datablock/schemes/CSchemePool.hpp"
+// -------------------------------------------------------------------------------------
"gflags/gflags.h" +#include "yaml-cpp/yaml.h" +#include "spdlog/spdlog.h" +#include "spdlog/sinks/rotating_file_sink.h" +#include "tbb/parallel_for.h" +#include "tbb/task_scheduler_init.h" +#include "roaring/roaring.hh" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +using namespace std; +// ------------------------------------------------------------------------------------- +DEFINE_string(in, "", "CSV input file path you want to parse without csv extension"); +DEFINE_string(out, "", "Output directory for parsed columns (binary format)"); +DEFINE_bool(verify, false, ""); +DEFINE_bool(parse, false, "Parse the data before processing"); +DEFINE_bool(only_parse, false, "Stop after parsing"); +DEFINE_bool(print_chunk_sample, false, ""); +DEFINE_string(single_in, "", "Prase single column file only"); +DEFINE_uint32(chunks, 1, "Limit the maximum number of processed relation chunks"); +DEFINE_uint32(split_strategy, 1, ""); +DEFINE_uint32(threads, 20, ""); +DEFINE_uint32(schemes, 15, ""); +// ------------------------------------------------------------------------------------- +DEFINE_string(yaml, "", ""); +// ------------------------------------------------------------------------------------- +int main(int argc, char **argv) +{ + // Relation::split was replace by Relation::getRanges in 010ace27, but this code was not adjusted + // Commenting out for now, as I have no idea what it is good for atm. + + /* + gflags::ParseCommandLineFlags(&argc, &argv, true); + cengine::db::CSchemePool::refresh(); + // ------------------------------------------------------------------------------------- + tbb::task_scheduler_init init(FLAGS_threads); + //tbb::task_scheduler_init init(tbb::task_scheduler_init::default_num_threads()); // Explicit number of threads + // ------------------------------------------------------------------------------------- + const string schema_path = FLAGS_yaml; + const string out_dir = schema_path.substr(0, schema_path.size() - str(".yaml").size()) + "/"; + // ------------------------------------------------------------------------------------- + cengine::Relation relation; + const auto schema = YAML::LoadFile(schema_path); + // Create relation out of yaml schema + relation = cengine::Relation(schema, out_dir); + auto chunks = relation.split(static_cast(FLAGS_split_strategy), FLAGS_chunks); + // ------------------------------------------------------------------------------------- + auto string_columns = vector(); + for ( u32 col_i = 0; col_i < relation.columns.size(); col_i++ ) { + if ( relation.columns[col_i].type == ColumnType::STRING ) { + string_columns.push_back(col_i); + } + } + // ------------------------------------------------------------------------------------- + u32 before_size = 0; + u32 after_size = 0; + for ( const auto &chunk: chunks ) { + std::map all_strings; + for ( auto col_i : string_columns ) { + for ( u32 row_i = 0; row_i < chunk.tuple_count; row_i++ ) { + auto current_str = chunk(col_i, row_i); + auto it = all_strings.find(current_str); + if ( it == all_strings.end()) { + all_strings.insert({current_str, {}}); + } + all_strings[current_str].add(col_i); + } + } + for ( const auto &str: all_strings ) { + auto &r = str.second; + before_size += str.first.length() * r.cardinality(); + after_size += str.first.length() * 1; + } + } + cout << before_size << '\t' << after_size << 
'\t' << CD(before_size) / CD(after_size) << endl; + */ + return 0; +} \ No newline at end of file diff --git a/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringStats.cpp b/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringStats.cpp new file mode 100644 index 0000000..bb8fb74 --- /dev/null +++ b/benchmarks/analyze_better_blocks/tools/datasets/stats/string-stats/StringStats.cpp @@ -0,0 +1,192 @@ +#include "Units.hpp" +#include "MMapvector.hpp" +// ------------------------------------------------------------------------------------- +#include "gflags/gflags.h" +// ------------------------------------------------------------------------------------- +#include +#include +#include +#include +// ------------------------------------------------------------------------------------- +DEFINE_bool(print_header, false, "."); +DEFINE_string(in, "", "."); +DEFINE_string(out_csv, "", "."); +DEFINE_string(delimiter, "\t", "."); +DEFINE_uint32(max_char_per_cell, 100, "."); +DEFINE_uint32(block_print_length, 20, "."); +DEFINE_uint32(block_count, 3, "."); +DEFINE_uint32(block_length, 65000, "."); +using namespace std; +// ------------------------------------------------------------------------------------- +// ------------------------------------------------------------------------------------- +map analyzeBlock(Vector &column, Vector &bitmap, u32 start_index, u32 tuple_count, bool print_block = false) +{ + map stats; + stats["random_element"] = column[start_index]; + u32 min_length = column[start_index].length(), max_length = column[start_index].length(); + u32 null_count = 0; + u32 zero_count = 0; + std::unordered_map frequency; + u64 sum_length = 0; + + for ( u32 tuple_i = start_index; tuple_i < start_index + tuple_count; tuple_i++ ) { + BITMAP is_set = bitmap.data[tuple_i]; + + if ( !is_set ) { + null_count++; + continue; + } + + auto current_value = column[tuple_i]; + if ( current_value.size() == 0 ) { + zero_count++; + } + + if ( frequency.find(current_value) == frequency.end()) { + frequency.insert({current_value, 1}); + } else { + frequency[current_value] = frequency[current_value] + 1; + } + if ( current_value.length() > max_length ) + max_length = current_value.length(); + if ( current_value.length() < min_length ) + min_length = current_value.length(); + + sum_length += current_value.length(); + } + const u32 set_count = tuple_count - null_count; + const u32 unique_count = frequency.size(); + { + using Comparator = function, pair)>; + // Defining a lambda function to compare two pairs. It will compare two pairs using second field + Comparator compFunctor = + [](pair elem1, pair elem2) { + return elem1.second > elem2.second; + }; + // Declaring a set that will store the pairs using above comparision logic + set, Comparator> frequency_set(frequency.begin(), frequency.end(), compFunctor); + u32 top_i = 1; + for ( const auto &element: frequency_set ) { + str value = element.first; + double frequency = static_cast(element.second) * 100.0 / static_cast(set_count); + string key_prefix = "top_" + to_string(top_i); + string value_key = key_prefix + "_value"; + string percent_key = key_prefix + "_percent"; + + stats[value_key] = value; + stats[percent_key] = to_string(frequency); + if ( top_i++ == ((unique_count >= 3) ? 
3 : unique_count)) {
+                break;
+            }
+        }
+        for ( ; top_i <= 3; top_i++ ) {
+            string key_prefix = "top_" + to_string(top_i);
+            string value_key = key_prefix + "_value";
+            string percent_key = key_prefix + "_percent";
+
+            stats[value_key] = "";
+            stats[percent_key] = "";
+        }
+    }
+    // Average over the block, computed in floating point to keep the fraction.
+    float average_length = static_cast<float>(sum_length) / static_cast<float>(tuple_count);
+
+    stats["min"] = to_string(min_length);
+    stats["max"] = to_string(max_length);
+    stats["null_count"] = to_string(null_count);
+    stats["zero_count"] = to_string(zero_count);
+    stats["unique_count"] = to_string(unique_count);
+    stats["average_length"] = to_string(average_length);
+    // -------------------------------------------------------------------------------------
+    if ( print_block ) {
+        string block_rep = "";
+        for ( u32 tuple_i = start_index + 1; tuple_i < start_index + FLAGS_block_print_length; tuple_i++ ) {
+            BITMAP is_set = bitmap.data[tuple_i];
+            if ( !is_set ) {
+                block_rep += "N";
+            } else if ( column[tuple_i] == column[tuple_i - 1] ) {
+                block_rep += ".";
+            } else {
+                block_rep += "x";
+            }
+        }
+        stats["block"] = block_rep;
+    }
+    // -------------------------------------------------------------------------------------
+    return stats;
+}
+int main(int argc, char **argv)
+{
+    srand(time(NULL));
+    // -------------------------------------------------------------------------------------
+    gflags::SetUsageMessage("CSV Dataset parser");
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
+    // -------------------------------------------------------------------------------------
+    assert(FLAGS_out_csv.size());
+    string data_file = FLAGS_in, bitmap_file;
+    {
+        std::regex re("(.*).string");
+        std::smatch match;
+        if ( std::regex_search(data_file, match, re) && match.size() > 1 ) {
+            bitmap_file = match.str(1) + ".bitmap";
+        }
+    }
+    // -------------------------------------------------------------------------------------
+    Vector column;
+    column.readBinary(data_file.c_str());
+    Vector bitmap;
+    bitmap.readBinary(bitmap_file.c_str());
+    auto tuple_count = bitmap.size();
+    assert(bitmap.size() == column.size());
+    // -------------------------------------------------------------------------------------
+    map stats;
+    {
+        std::regex re("(\\/[^\\/]+\\/[^\\/]+).string");
+        std::smatch match;
+        if ( std::regex_search(data_file, match, re) && match.size() > 1 ) {
+            stats["col_id"] = match.str(1);
+        }
+    }
+    // -------------------------------------------------------------------------------------
+    auto whole_column = analyzeBlock(column, bitmap, 0, tuple_count);
+    for ( const auto &element: whole_column ) {
+        stats["col_" + element.first] = element.second;
+    }
+    // -------------------------------------------------------------------------------------
+    for ( u32 block_i = 1; block_i <= FLAGS_block_count; block_i++ ) {
+        u32 start_index = rand() % tuple_count;
+        u32 block_length = std::min(FLAGS_block_length, static_cast<u32>(tuple_count - start_index));
+        auto block = analyzeBlock(column, bitmap, start_index, block_length, true);
+        for ( const auto &element: block ) {
+            stats["block_" + to_string(block_i) + "_" + element.first] = element.second;
+        }
+    }
+
+    std::ofstream csv;
+    csv.open(FLAGS_out_csv, std::ofstream::out | std::ofstream::app);
+    assert(csv.good());
+    if ( csv.tellp() == 0 ) {
+        for ( auto it = stats.begin(); it != stats.end(); ) {
+            csv << it->first;
+            if ( ++it != stats.end()) {
+                csv << FLAGS_delimiter;
+            }
+        }
+        csv << endl;
+    }
+    for ( auto it = stats.begin(); it != stats.end(); ) {
+        auto sub_str = it->second.substr(0, FLAGS_max_char_per_cell);
+        std::regex
diff --git a/benchmarks/analyze_better_blocks/tools/datasets/string_stats.sh b/benchmarks/analyze_better_blocks/tools/datasets/string_stats.sh
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/tools/engine-comparison/note.txt b/benchmarks/analyze_better_blocks/tools/engine-comparison/note.txt
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/tools/list-s3-btrfiles.sh b/benchmarks/analyze_better_blocks/tools/list-s3-btrfiles.sh
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/tools/misc/create_table_parser.cpp b/benchmarks/analyze_better_blocks/tools/misc/create_table_parser.cpp
new file mode 100644
index 0000000..a30d2d5
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/tools/misc/create_table_parser.cpp
@@ -0,0 +1,12 @@
+#include <fstream>
+#include <iostream>
+
+using namespace std;
+int main(int argc, char **argv){
+
+   // stub: open the file named on the command line for reading
+   ifstream input_file;
+   input_file.open(argv[1], std::ifstream::in);
+
+   return 0;
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/tools/misc/local.cmake b/benchmarks/analyze_better_blocks/tools/misc/local.cmake
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/tools/mmapvector/CMakeLists.txt b/benchmarks/analyze_better_blocks/tools/mmapvector/CMakeLists.txt
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/tools/mmapvector/MMapVector.cpp b/benchmarks/analyze_better_blocks/tools/mmapvector/MMapVector.cpp
new file mode 100644
index 0000000..c3463d6
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/tools/mmapvector/MMapVector.cpp
@@ -0,0 +1,28 @@
+#include "MMapvector.hpp"
+using namespace std;
+
+// -------------------------------------------------------------------------------------
+void writeBinary(const char *pathname, std::vector<std::string> &v)
+{
+   std::cout << "Writing binary file : " << pathname << std::endl;
+   int fd = open(pathname, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+   die_if(fd != -1);
+   // layout: 8-byte count, one 16-byte (size, offset) slot per string, then the payloads;
+   // one spare byte per string is reserved but never written
+   uint64_t fileSize = 8 + 16 * v.size();
+   for ( const auto &s:v )
+      fileSize += s.size() + 1;
+   die_if(posix_fallocate(fd, 0, fileSize) == 0);
+   auto data = reinterpret_cast<Vector<std::string_view>::Data *>(mmap(nullptr, fileSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+   die_if(data != MAP_FAILED); // check the mapping before touching it
+   data->count = v.size();
+   uint64_t offset = 8 + 16 * v.size();
+   char *dst = reinterpret_cast<char *>(data);
+   uint64_t slot = 0;
+   for ( const auto &s:v ) {
+      data->slot[slot].size = s.size();
+      data->slot[slot].offset = offset;
+      memcpy(dst + offset, s.data(), s.size());
+      offset += s.size();
+      slot++;
+   }
+   die_if(munmap(data, fileSize) == 0); // flush and release the mapping before closing
+   die_if(close(fd) == 0);
+}
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/tools/mmapvector/MMapvector.hpp b/benchmarks/analyze_better_blocks/tools/mmapvector/MMapvector.hpp
new file mode 100644
index 0000000..d1b30a3
--- /dev/null
+++ b/benchmarks/analyze_better_blocks/tools/mmapvector/MMapvector.hpp
@@ -0,0 +1,118 @@
+#pragma once
+// -------------------------------------------------------------------------------------
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <string_view>
+#include <vector>
+// -------------------------------------------------------------------------------------
+#define die_if(expr) if (!(expr)) { perror(#expr); assert(false); }
+
+using std::endl;
+using std::cout;
+template <typename T>
+struct Vector {
+   uint64_t count;
+   int fd;
+   T *data;
+
+   Vector()
+      : data(nullptr) {}
+   Vector(const char *pathname) { readBinary(pathname); }
+   ~Vector()
+   {
+      delete[] data;
+   }
+
+   void readBinary(const char *pathname)
+   {
+//      std::cout << "Reading binary file : " << pathname << std::endl;
+      fd = open(pathname, O_RDONLY);
+      if(fd == -1) {
+         cout << pathname << endl;
+      }
+      die_if(fd != -1);
+      struct stat sb;
+      die_if(fstat(fd, &sb) != -1);
+      count = static_cast<uint64_t>(sb.st_size) / sizeof(T);
+      data = new T[count];
+      die_if(read(fd, data, sb.st_size) == sb.st_size);
+      die_if(close(fd) == 0);
+   }
+
+   uint64_t size() const { return count; }
+   T operator[](std::size_t idx) const { return data[idx]; }
+};
+
+template <typename T>
+void writeBinary(const char *pathname, std::vector<T> &v)
+{
+   std::cout << "Writing binary file : " << pathname << std::endl;
+   int fd = open(pathname, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+   die_if(fd != -1);
+   uint64_t length = v.size() * sizeof(T);
+   die_if(posix_fallocate(fd, 0, length) == 0);
+   T *data = reinterpret_cast<T *>(mmap(nullptr, length, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+   die_if(data != MAP_FAILED);
+   memcpy(data, v.data(), length);
+   die_if(munmap(data, length) == 0); // flush and release the mapping before closing
+   die_if(close(fd) == 0);
+}
+
+
+typedef struct {
+   uint64_t size;
+   uint64_t offset;
+} StringIndexSlot;
+
+template<>
+struct Vector<std::string_view> {
+   struct Data {
+      uint64_t count;
+      StringIndexSlot slot[];
+   };
+
+   uint64_t fileSize;
+   int fd;
+   Data *data;
+
+   Vector()
+      : data(nullptr) {}
+   Vector(const char *pathname) { readBinary(pathname); }
+   ~Vector()
+   {
+      if ( data ){
+         die_if(munmap(data, fileSize) == 0);
+         die_if(close(fd) == 0);
+      }
+   }
+
+   void readBinary(const char *pathname)
+   {
+//      std::cout << "Reading binary file : " << pathname << std::endl;
+      fd = open(pathname, O_RDONLY);
+      if(fd == -1) {
+         cout << pathname << endl;
+      }
+      die_if(fd != -1);
+      struct stat sb;
+      die_if(fstat(fd, &sb) != -1);
+      fileSize = static_cast<uint64_t>(sb.st_size);
+      data = reinterpret_cast<Data *>(mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, fd, 0));
+      die_if(data != MAP_FAILED);
+   }
+
+   uint64_t size() const { return data->count; }
+   std::string_view operator[](std::size_t idx) const
+   {
+      auto slot = data->slot[idx];
+      return std::string_view(reinterpret_cast<const char *>(data) + slot.offset, slot.size);
+   }
+};
+void writeBinary(const char *pathname, std::vector<std::string> &v);
\ No newline at end of file
diff --git a/benchmarks/analyze_better_blocks/tools/prepare-ec2-instance.sh b/benchmarks/analyze_better_blocks/tools/prepare-ec2-instance.sh
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/tools/r.bash b/benchmarks/analyze_better_blocks/tools/r.bash
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/tools/stats.py b/benchmarks/analyze_better_blocks/tools/stats.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/vendor/aws-sdk.cmake b/benchmarks/analyze_better_blocks/vendor/aws-sdk.cmake
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/analyze_better_blocks/vendor/benchmark.cmake
b/benchmarks/analyze_better_blocks/vendor/benchmark.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/croaring.cmake b/benchmarks/analyze_better_blocks/vendor/croaring.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/fastpfor.cmake b/benchmarks/analyze_better_blocks/vendor/fastpfor.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/fsst.cmake b/benchmarks/analyze_better_blocks/vendor/fsst.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/gdouble.cmake b/benchmarks/analyze_better_blocks/vendor/gdouble.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/gflags.cmake b/benchmarks/analyze_better_blocks/vendor/gflags.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/googletest.cmake b/benchmarks/analyze_better_blocks/vendor/googletest.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/lz4.cmake b/benchmarks/analyze_better_blocks/vendor/lz4.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/rapidjson.cmake b/benchmarks/analyze_better_blocks/vendor/rapidjson.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/spdlog.cmake b/benchmarks/analyze_better_blocks/vendor/spdlog.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/tbb.cmake b/benchmarks/analyze_better_blocks/vendor/tbb.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/turbo.cmake b/benchmarks/analyze_better_blocks/vendor/turbo.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/analyze_better_blocks/vendor/yaml-cpp.cmake b/benchmarks/analyze_better_blocks/vendor/yaml-cpp.cmake new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/bench_compression_ratio/CMakeLists.txt b/benchmarks/bench_compression_ratio/CMakeLists.txt new file mode 100644 index 0000000..c7c9468 --- /dev/null +++ b/benchmarks/bench_compression_ratio/CMakeLists.txt @@ -0,0 +1,37 @@ +if (NOT DEFINED ENV{ALP_DATASET_DIR_PATH}) + message(FATAL_ERROR "You must set ALP_DATASET_DIR_PATH environment variable") +endif () + +add_executable(bench_alp_compression_ratio alp.cpp) +target_link_libraries(bench_alp_compression_ratio PUBLIC ALP gtest_main) +gtest_discover_tests(bench_alp_compression_ratio) + +# Test ALP32 on ML Data: ---------------------------------------------------------------------------------------------------------- +add_executable(bench_alp32_compression_ratio alp32.cpp) +target_link_libraries(bench_alp32_compression_ratio PUBLIC ALP gtest_main) +gtest_discover_tests(bench_alp32_compression_ratio) + +# Test ZSTD: ---------------------------------------------------------------------------------------------------------- +add_executable(bench_zstd_compression_ratio zstd.cpp) +target_link_libraries(bench_zstd_compression_ratio PRIVATE gtest_main libzstd) +gtest_discover_tests(bench_zstd_compression_ratio) + +# Test Chimp: ---------------------------------------------------------------------------------------------------------- +add_executable(bench_chimp_compression_ratio chimp.cpp) +target_link_libraries(bench_chimp_compression_ratio PRIVATE gtest_main) 
+gtest_discover_tests(bench_chimp_compression_ratio)
+
+# Test Chimp128: ----------------------------------------------------------------------------------------------------------
+add_executable(bench_chimp128_compression_ratio chimp128.cpp)
+target_link_libraries(bench_chimp128_compression_ratio PRIVATE gtest_main)
+gtest_discover_tests(bench_chimp128_compression_ratio)
+
+# Test Gorillas: ----------------------------------------------------------------------------------------------------------
+add_executable(bench_gorillas_compression_ratio gorillas.cpp)
+target_link_libraries(bench_gorillas_compression_ratio PRIVATE gtest_main)
+gtest_discover_tests(bench_gorillas_compression_ratio)
+
+# Test Patas: ----------------------------------------------------------------------------------------------------------
+add_executable(bench_patas_compression_ratio patas.cpp)
+target_link_libraries(bench_patas_compression_ratio PRIVATE gtest_main)
+gtest_discover_tests(bench_patas_compression_ratio)
\ No newline at end of file
diff --git a/benchmarks/bench_compression_ratio/alp.cpp b/benchmarks/bench_compression_ratio/alp.cpp
new file mode 100644
index 0000000..12ab8bb
--- /dev/null
+++ b/benchmarks/bench_compression_ratio/alp.cpp
@@ -0,0 +1,284 @@
+#include "alp.hpp"
+#include "alp_result.hpp"
+#include "data.hpp"
+#include "test/mapper.hpp"
+#include "gtest/gtest.h"
+#include <cmath>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+
+using namespace alp::config;
+/*
+ * ALP overhead per vector in a hypothetical file format = bit_width + factor-idx + exponent-idx + ffor base;
+ */
+double overhead_per_vector {static_cast<double>(8 + 8 + 8 + 64) / VECTOR_SIZE};
+
+double calculate_alp_compression_size(std::vector<alp_bench::VectorMetadata>& vector_metadata) {
+	double avg_bits_per_value {0};
+	for (auto& metadata : vector_metadata) {
+		avg_bits_per_value = avg_bits_per_value + metadata.bit_width;
+		avg_bits_per_value = avg_bits_per_value +
+		                     (static_cast<double>(metadata.exceptions_count) *
+		                      (alp::Constants::EXCEPTION_SIZE + alp::EXCEPTION_POSITION_SIZE) / VECTOR_SIZE);
+	}
+
+	avg_bits_per_value = avg_bits_per_value / vector_metadata.size();
+	avg_bits_per_value = avg_bits_per_value + overhead_per_vector;
+	return avg_bits_per_value;
+}
+
+/*
+ * ALPRD overhead per vector in a hypothetical file format in which the left-parts dictionary is at the start of a
+ * rowgroup
+ */
+double alprd_overhead_per_vector {static_cast<double>(MAX_RD_DICTIONARY_SIZE * 16) / ROWGROUP_SIZE};
+
+double calculate_alprd_compression_size(std::vector<alp_bench::VectorMetadata>& vector_metadata) {
+	double avg_bits_per_value {0};
+	for (auto& metadata : vector_metadata) {
+		avg_bits_per_value = avg_bits_per_value + metadata.right_bit_width + metadata.left_bit_width +
+		                     static_cast<double>(metadata.exceptions_count *
+		                                         (alp::RD_EXCEPTION_SIZE + alp::RD_EXCEPTION_POSITION_SIZE)) /
+		                         VECTOR_SIZE;
+	}
+
+	avg_bits_per_value = avg_bits_per_value / vector_metadata.size();
+	avg_bits_per_value = avg_bits_per_value + alprd_overhead_per_vector;
+
+	return avg_bits_per_value;
+}
+
+double get_average_exception_count(std::vector<alp_bench::VectorMetadata>& vector_metadata) {
+	double avg_exceptions_count {0};
+	for (auto& metadata : vector_metadata) {
+		avg_exceptions_count = avg_exceptions_count + metadata.exceptions_count;
+	}
+
+	avg_exceptions_count = avg_exceptions_count / vector_metadata.size();
+	return avg_exceptions_count;
+}
+
+class alp_test : public ::testing::Test {
+public:
+	double*   dbl_arr {};
+	double*   exc_arr {};
+	uint16_t* rd_exc_arr {};
+	uint16_t* pos_arr {};
+	uint16_t* exc_c_arr {};
+	int64_t*  ffor_arr {};
+	int64_t*  unffor_arr {};
+	int64_t*  base_arr {};
+	int64_t*  encoded_arr {};
+	double*   dec_dbl_arr {};
+	double*   smp_arr {};
+	uint64_t* ffor_right_arr {};
+	uint16_t* ffor_left_arr {};
+	uint64_t* right_arr {};
+	uint16_t* left_arr {};
+	uint64_t* unffor_right_arr {};
+	uint16_t* unffor_left_arr {};
+	double*   glue_arr {};
+
+	alp::state state;
+
+	uint8_t bit_width {};
+
+	void SetUp() override {
+		dbl_arr          = new double[VECTOR_SIZE];
+		exc_arr          = new double[VECTOR_SIZE];
+		rd_exc_arr       = new uint16_t[VECTOR_SIZE];
+		pos_arr          = new uint16_t[VECTOR_SIZE];
+		encoded_arr      = new int64_t[VECTOR_SIZE];
+		dec_dbl_arr      = new double[VECTOR_SIZE];
+		exc_c_arr        = new uint16_t[VECTOR_SIZE];
+		ffor_arr         = new int64_t[VECTOR_SIZE];
+		unffor_arr       = new int64_t[VECTOR_SIZE];
+		base_arr         = new int64_t[VECTOR_SIZE];
+		smp_arr          = new double[VECTOR_SIZE];
+		right_arr        = new uint64_t[VECTOR_SIZE];
+		left_arr         = new uint16_t[VECTOR_SIZE];
+		ffor_right_arr   = new uint64_t[VECTOR_SIZE];
+		ffor_left_arr    = new uint16_t[VECTOR_SIZE];
+		unffor_right_arr = new uint64_t[VECTOR_SIZE];
+		unffor_left_arr  = new uint16_t[VECTOR_SIZE];
+		glue_arr         = new double[VECTOR_SIZE];
+	}
+
+	~alp_test() override {
+		delete[] dbl_arr;
+		delete[] exc_arr;
+		delete[] rd_exc_arr;
+		delete[] pos_arr;
+		delete[] encoded_arr;
+		delete[] dec_dbl_arr;
+		delete[] exc_c_arr;
+		delete[] ffor_arr;
+		delete[] unffor_arr;
+		delete[] base_arr;
+		delete[] smp_arr;
+		delete[] right_arr;
+		delete[] left_arr;
+		delete[] ffor_right_arr;
+		delete[] ffor_left_arr;
+		delete[] unffor_right_arr;
+		delete[] unffor_left_arr;
+		delete[] glue_arr;
+	}
+};
+
+/*
+ * Test to encode and decode whole datasets using ALP
+ * This test outputs a CSV file with the estimated bits/value after compression with ALP
+ */
+
+TEST_F(alp_test, test_alp_on_whole_datasets) {
+
+	if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v == nullptr) {
+		throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!");
+	}
+
+	std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "alp_compression_ratio.csv", std::ios::out);
+	ofile << "dataset,size,rowgroups_count,vectors_count\n";
+
+	for (auto& dataset : alp_bench::alp_dataset) {
+		if (dataset.suitable_for_cutting) { continue; }
+
+		std::cout << dataset.name << std::endl;
+
+		std::vector<alp_bench::VectorMetadata> compression_metadata;
+		size_t tuples_count;
+		auto*  data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path);
+		double value_to_encode {0.0};
+		size_t vector_idx {0};
+		size_t rowgroup_counter {0};
+		size_t rowgroup_offset {0};
+		alp::state stt;
+		size_t rowgroups_count = std::ceil(static_cast<double>(tuples_count) / ROWGROUP_SIZE);
+		size_t vectors_count   = tuples_count / VECTOR_SIZE;
+		/* Init */
+		alp::AlpEncode::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt);
+		/* Encode - Decode - Validate.
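+		 * A worked example of the size model above, with assumed constants (EXCEPTION_SIZE = 64 and
+		 * EXCEPTION_POSITION_SIZE = 16 bits for doubles, VECTOR_SIZE = 1024): a vector that FFOR-packs
+		 * to bit_width = 16 with 5 exceptions is estimated at
+		 *     16 + 5 * (64 + 16) / 1024 + (8 + 8 + 8 + 64) / 1024  =  approx. 16.48 bits per value.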
*/ + for (size_t i = 0; i < tuples_count; i++) { + value_to_encode = data_column[i]; + dbl_arr[vector_idx] = value_to_encode; + vector_idx = vector_idx + 1; + rowgroup_offset = rowgroup_offset + 1; + rowgroup_counter = rowgroup_counter + 1; + + if (vector_idx != VECTOR_SIZE) { continue; } + if (rowgroup_counter == ROWGROUP_SIZE) { + rowgroup_counter = 0; + alp::AlpEncode::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt); + } + alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, encoded_arr, stt); + alp::AlpEncode::analyze_ffor(encoded_arr, bit_width, base_arr); + ffor::ffor(encoded_arr, ffor_arr, bit_width, base_arr); + + unffor::unffor(ffor_arr, unffor_arr, bit_width, base_arr); + alp::AlpDecode::decode(unffor_arr, stt.fac, stt.exp, dec_dbl_arr); + alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr); + + for (size_t j = 0; j < VECTOR_SIZE; j++) { + auto l = dbl_arr[j]; + auto r = dec_dbl_arr[j]; + if (l != r) { std::cerr << j << ", " << rowgroup_offset << ", " << dataset.name << "\n"; } + ASSERT_EQ(dbl_arr[j], dec_dbl_arr[j]); + } + compression_metadata.push_back({bit_width, exc_c_arr[0]}); + vector_idx = 0; + bit_width = 0; + } + auto compression_ratio = calculate_alp_compression_size(compression_metadata); + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," + << rowgroups_count << "," << vectors_count << std::endl; + + if (alp_bench::results.find(dataset.name) != + alp_bench::results.end()) { // To avoid error when tested dataset is not found on results + ASSERT_EQ(alp_bench::to_str(compression_ratio), alp_bench::results.find(dataset.name)->second); + } + } +} + +/* + * Test to encode and decode whole datasets using ALP RD (aka ALP Cutter) + * This test will output and write a file with the estimated bits/value after compression with alp + */ +TEST_F(alp_test, test_alprd_on_whole_datasets) { + std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "alp_rd_compression_ratio.csv", std::ios::out); + ofile << "dataset,size,rowgroups_count,vectors_count\n"; + + for (auto& dataset : alp_bench::alp_dataset) { + if (!dataset.suitable_for_cutting) { continue; } + + std::vector compression_metadata; + size_t tuples_count; + auto* data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path); + double value_to_encode = 0.0; + size_t vector_idx {0}; + size_t rowgroup_counter {0}; + size_t rowgroup_offset {0}; + alp::state stt; + size_t rowgroups_count {1}; + size_t vectors_count {1}; + + /* Init */ + alp::AlpEncode::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt); + + ASSERT_EQ(stt.scheme, alp::SCHEME::ALP_RD); + + alp::AlpRD::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt); + + /* Encode - Decode - Validate. 
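+		 * A worked example of calculate_alprd_compression_size, with assumed constants
+		 * (MAX_RD_DICTIONARY_SIZE = 8, ROWGROUP_SIZE = 100 * 1024): a double vector split into
+		 * right_bit_width = 49 and left_bit_width = 3 with no exceptions costs about
+		 *     49 + 3 + (8 * 16) / 102400  =  approx. 52.00 bits per value,
+		 * i.e. the left-parts dictionary amortizes to roughly 0.001 bits per value over a rowgroup.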
*/ + for (size_t i = 0; i < tuples_count; i++) { + value_to_encode = data_column[i]; + dbl_arr[vector_idx] = value_to_encode; + vector_idx = vector_idx + 1; + rowgroup_offset = rowgroup_offset + 1; + rowgroup_counter = rowgroup_counter + 1; + + if (vector_idx != VECTOR_SIZE) { continue; } + + if (rowgroup_counter == ROWGROUP_SIZE) { + rowgroup_counter = 0; + rowgroups_count = rowgroups_count + 1; + } + + // Encode + alp::AlpRD::encode(dbl_arr, rd_exc_arr, pos_arr, exc_c_arr, right_arr, left_arr, stt); + ffor::ffor(right_arr, ffor_right_arr, stt.right_bit_width, &stt.right_for_base); + ffor::ffor(left_arr, ffor_left_arr, stt.left_bit_width, &stt.left_for_base); + + // Decode + unffor::unffor(ffor_right_arr, unffor_right_arr, stt.right_bit_width, &stt.right_for_base); + unffor::unffor(ffor_left_arr, unffor_left_arr, stt.left_bit_width, &stt.left_for_base); + alp::AlpRD::decode( + glue_arr, unffor_right_arr, unffor_left_arr, rd_exc_arr, pos_arr, exc_c_arr, stt); + + auto* dbl_glue_arr = reinterpret_cast(glue_arr); + for (size_t j = 0; j < VECTOR_SIZE; ++j) { + auto l = dbl_arr[j]; + auto r = dbl_glue_arr[j]; + if (l != r) { std::cerr << j << ", " << dataset.name << "\n"; } + + ASSERT_EQ(dbl_arr[j], dbl_glue_arr[j]); + } + + alp_bench::VectorMetadata vector_metadata; + vector_metadata.right_bit_width = stt.right_bit_width; + vector_metadata.left_bit_width = stt.left_bit_width; + vector_metadata.exceptions_count = stt.exceptions_count; + + compression_metadata.push_back(vector_metadata); + vector_idx = 0; + bit_width = 0; + + vectors_count = vectors_count + 1; + } + + auto compression_ratio = calculate_alprd_compression_size(compression_metadata); + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," + << rowgroups_count << "," << vectors_count << std::endl; + + if (alp_bench::results.find(dataset.name) != + alp_bench::results.end()) { // To avoid error when tested dataset is not found on results + ASSERT_EQ(alp_bench::to_str(compression_ratio), alp_bench::results.find(dataset.name)->second); + } + } +} diff --git a/benchmarks/bench_compression_ratio/alp32.cpp b/benchmarks/bench_compression_ratio/alp32.cpp new file mode 100644 index 0000000..1d344b5 --- /dev/null +++ b/benchmarks/bench_compression_ratio/alp32.cpp @@ -0,0 +1,212 @@ +#include "alp.hpp" +#include "alp_result.hpp" +#include "data.hpp" +#include "test/mapper.hpp" +#include "gtest/gtest.h" +#include + +using namespace alp::config; +/* + * ALP overhead per vector in a hypothetic file format = bit_width + factor-idx + exponent-idx + ffor base; + */ +double overhead_per_vector {static_cast(8 + 8 + 8 + 32) / VECTOR_SIZE}; + +double calculate_alp_compression_size(std::vector& vector_metadata) { + double avg_bits_per_value {0}; + for (auto& metadata : vector_metadata) { + avg_bits_per_value = avg_bits_per_value + metadata.bit_width; + avg_bits_per_value = + avg_bits_per_value + (static_cast(metadata.exceptions_count) * + (alp::Constants::EXCEPTION_SIZE + alp::EXCEPTION_POSITION_SIZE) / VECTOR_SIZE); + } + + avg_bits_per_value = avg_bits_per_value / vector_metadata.size(); + avg_bits_per_value = avg_bits_per_value + overhead_per_vector; + return avg_bits_per_value; +} + +/* + * ALPRD Overhead per vector in a hypothetic file format in which the left parts dictionary is at the start of a + * rowgroup + */ +double alprd_overhead_per_vector {static_cast(MAX_RD_DICTIONARY_SIZE * 16) / ROWGROUP_SIZE}; + +double calculate_alprd_compression_size(std::vector& vector_metadata) { + double avg_bits_per_value 
{0}; + for (auto& metadata : vector_metadata) { + avg_bits_per_value = avg_bits_per_value + metadata.right_bit_width + metadata.left_bit_width + + static_cast(metadata.exceptions_count * + (alp::RD_EXCEPTION_SIZE + alp::RD_EXCEPTION_POSITION_SIZE)) / + VECTOR_SIZE; + } + + avg_bits_per_value = avg_bits_per_value / vector_metadata.size(); + avg_bits_per_value = avg_bits_per_value + alprd_overhead_per_vector; + + return avg_bits_per_value; +} + +double get_average_exception_count(std::vector& vector_metadata) { + double avg_exceptions_count {0}; + for (auto& metadata : vector_metadata) { + avg_exceptions_count = avg_exceptions_count + metadata.exceptions_count; + } + + avg_exceptions_count = avg_exceptions_count / vector_metadata.size(); + return avg_exceptions_count; +} + +class alp32_test : public ::testing::Test { +public: + float* dbl_arr {}; + float* exc_arr {}; + uint16_t* rd_exc_arr {}; + uint16_t* pos_arr {}; + uint16_t* exc_c_arr {}; + int64_t* ffor_arr {}; + int64_t* unffor_arr {}; + int64_t* base_arr {}; + int64_t* encoded_arr {}; + float* dec_dbl_arr {}; + float* smp_arr {}; + uint32_t* ffor_right_arr {}; + uint16_t* ffor_left_arr {}; + uint32_t* right_arr {}; + uint16_t* left_arr {}; + uint32_t* unffor_right_arr {}; + uint16_t* unffor_left_arr {}; + float* glue_arr {}; + + alp::state state; + + uint8_t bit_width {}; + + void SetUp() override { + dbl_arr = new float[VECTOR_SIZE]; + exc_arr = new float[VECTOR_SIZE]; + rd_exc_arr = new uint16_t[VECTOR_SIZE]; + pos_arr = new uint16_t[VECTOR_SIZE]; + encoded_arr = new int64_t[VECTOR_SIZE]; + dec_dbl_arr = new float[VECTOR_SIZE]; + exc_c_arr = new uint16_t[VECTOR_SIZE]; + ffor_arr = new int64_t[VECTOR_SIZE]; + unffor_arr = new int64_t[VECTOR_SIZE]; + base_arr = new int64_t[VECTOR_SIZE]; + smp_arr = new float[VECTOR_SIZE]; + right_arr = new uint32_t[VECTOR_SIZE]; + left_arr = new uint16_t[VECTOR_SIZE]; + ffor_right_arr = new uint32_t[VECTOR_SIZE]; + ffor_left_arr = new uint16_t[VECTOR_SIZE]; + unffor_right_arr = new uint32_t[VECTOR_SIZE]; + unffor_left_arr = new uint16_t[VECTOR_SIZE]; + glue_arr = new float[VECTOR_SIZE]; + } + + ~alp32_test() override { + delete[] dbl_arr; + delete[] exc_arr; + delete[] rd_exc_arr; + delete[] pos_arr; + delete[] encoded_arr; + delete[] dec_dbl_arr; + delete[] exc_c_arr; + delete[] ffor_arr; + delete[] unffor_arr; + delete[] base_arr; + delete[] smp_arr; + delete[] right_arr; + delete[] left_arr; + delete[] unffor_right_arr; + delete[] unffor_left_arr; + } +}; + +/* + * Test to encode and decode whole datasets using ALP RD (aka ALP Cutter) + * This test will output and write a file with the estimated bits/value after compression with alp + */ +TEST_F(alp32_test, test_alprd32_on_whole_datasets) { + std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "alp_rd32_compression_ratio.csv", std::ios::out); + ofile << "dataset,size,rowgroups_count,vectors_count\n"; + + for (auto& dataset : alp_bench::sp_datasets) { + if (!dataset.suitable_for_cutting) { continue; } + + std::cout << dataset.name << std::endl; + + std::vector compression_metadata; + size_t tuples_count; + auto* data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path); + float value_to_encode = 0.0; + size_t vector_idx {0}; + size_t rowgroup_counter {0}; + size_t rowgroup_offset {0}; + alp::state stt; + size_t rowgroups_count {1}; + size_t vectors_count {1}; + + /* Init */ + alp::AlpEncode::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt); + + ASSERT_EQ(stt.scheme, alp::SCHEME::ALP_RD); + + 
alp::AlpRD::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt); + + /* Encode - Decode - Validate. */ + for (size_t i = 0; i < tuples_count; i++) { + value_to_encode = data_column[i]; + dbl_arr[vector_idx] = value_to_encode; + vector_idx = vector_idx + 1; + rowgroup_offset = rowgroup_offset + 1; + rowgroup_counter = rowgroup_counter + 1; + + if (vector_idx != VECTOR_SIZE) { continue; } + + if (rowgroup_counter == ROWGROUP_SIZE) { + rowgroup_counter = 0; + rowgroups_count = rowgroups_count + 1; + } + + // Encode + alp::AlpRD::encode(dbl_arr, rd_exc_arr, pos_arr, exc_c_arr, right_arr, left_arr, stt); + uint32_t right_for_base = 0; + ffor::ffor(right_arr, ffor_right_arr, stt.right_bit_width, &right_for_base); + ffor::ffor(left_arr, ffor_left_arr, stt.left_bit_width, &stt.left_for_base); + + // Decode + unffor::unffor(ffor_right_arr, unffor_right_arr, stt.right_bit_width, &right_for_base); + unffor::unffor(ffor_left_arr, unffor_left_arr, stt.left_bit_width, &stt.left_for_base); + alp::AlpRD::decode(glue_arr, unffor_right_arr, unffor_left_arr, rd_exc_arr, pos_arr, exc_c_arr, stt); + + auto* dbl_glue_arr = reinterpret_cast(glue_arr); + for (size_t j = 0; j < VECTOR_SIZE; ++j) { + auto l = dbl_arr[j]; + auto r = dbl_glue_arr[j]; + if (l != r) { std::cerr << j << ", " << dataset.name << "\n"; } + + ASSERT_EQ(dbl_arr[j], dbl_glue_arr[j]); + } + + alp_bench::VectorMetadata vector_metadata; + vector_metadata.right_bit_width = stt.right_bit_width; + vector_metadata.left_bit_width = stt.left_bit_width; + vector_metadata.exceptions_count = stt.exceptions_count; + + compression_metadata.push_back(vector_metadata); + vector_idx = 0; + bit_width = 0; + + vectors_count = vectors_count + 1; + } + + auto compression_ratio = calculate_alprd_compression_size(compression_metadata); + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," + << rowgroups_count << "," << vectors_count << std::endl; + + if (alp_bench::results.find(dataset.name) != + alp_bench::results.end()) { // To avoid error when tested dataset is not found on results + ASSERT_EQ(alp_bench::to_str(compression_ratio), alp_bench::results.find(dataset.name)->second); + } + } +} diff --git a/benchmarks/bench_compression_ratio/chimp.cpp b/benchmarks/bench_compression_ratio/chimp.cpp new file mode 100644 index 0000000..6cf56ee --- /dev/null +++ b/benchmarks/bench_compression_ratio/chimp.cpp @@ -0,0 +1,153 @@ +#include "chimp/chimp.hpp" +#include "alp.hpp" +#include "data.hpp" +#include "test/mapper.hpp" +#include "gtest/gtest.h" + +class chimp_test : public ::testing::Test { +public: + uint8_t* data_arr; + uint8_t* flags_arr; + uint8_t* leading_zero_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + alp_bench::ChimpCompressionState state; + alp_bench::ChimpConstants::Flags* flags; + uint8_t* leading_zero_unpacked; + alp_bench::FlagBuffer flag_buffer; + alp_bench::LeadingZeroBuffer leading_zero_buffer; + alp_bench::ChimpDecompressionState chimp_de_state; + uint32_t leading_zero_index; + uint8_t leading_zero_block_count; + size_t leading_zero_block_size; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new uint8_t[8096]; + flags_arr = new uint8_t[1025]; + leading_zero_arr = new uint8_t[1024]; + dec_arr = new uint64_t[1024]; + leading_zero_unpacked = new uint8_t[1024]; + flags = new alp_bench::ChimpConstants::Flags[1024]; + } + + ~chimp_test() override { + delete[] dbl_arr; + delete[] data_arr; + delete[] flags_arr; + delete[] 
leading_zero_arr; + delete[] dec_arr; + } +}; + +/* + * Chimp overhead per vector in a hypothetic file format = leading_zero_block_count + size_of_data_block + + * start_of_data; Start of Data is needed if Data Blocks and Metadata are stored separately (like in DuckDB to optimize + * decoding speed) + */ +double chimp_overhead_per_vector {static_cast(8 + 16 + 16)}; + +TEST_F(chimp_test, test_chimp_on_whole_datasets) { + + if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v == nullptr) { + throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!"); + } + + std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "chimp_compression_ratio.csv", std::ios::out); + ofile << "dataset,size,vectors_count\n"; + + for (auto& dataset : alp_bench::alp_dataset) { + + std::cout << dataset.name << std::endl; + + size_t compressed_data_size = 0; + + size_t tuples_count; + auto* data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path); + double value_to_encode {0.0}; + size_t vector_idx {0}; + size_t rowgroup_offset {0}; + size_t vectors_count = {0}; + /* Encode - Decode - Validate. */ + for (size_t i = 0; i < tuples_count; i++) { + value_to_encode = data_column[i]; + dbl_arr[vector_idx] = value_to_encode; + vector_idx = vector_idx + 1; + rowgroup_offset = rowgroup_offset + 1; + + if (vector_idx != alp::config::VECTOR_SIZE) { continue; } + + // Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.leading_zero_buffer.SetBuffer(leading_zero_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < alp::config::VECTOR_SIZE; ++i) { + alp_bench::ChimpCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + + // SUM COMPRESSION SIZE + compressed_data_size += 16; + size_t bytes_used_by_data = state.output.BytesWritten(); + size_t flag_bytes = state.flag_buffer.BytesUsed(); + size_t leading_zero_block_count = state.leading_zero_buffer.BlockCount(); + size_t bytes_used_by_leading_zero_blocks = 3 * leading_zero_block_count; + compressed_data_size += + (alp_bench::AlignValue(bytes_used_by_data) + flag_bytes + bytes_used_by_leading_zero_blocks) * 8; + compressed_data_size += chimp_overhead_per_vector; + + // Init decoding + leading_zero_block_count = state.leading_zero_buffer.BlockCount(); + leading_zero_block_size = static_cast(leading_zero_block_count) * 8; + leading_zero_index = 0; + chimp_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + leading_zero_buffer.SetBuffer(leading_zero_arr); + + flags[0] = alp_bench::ChimpConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (size_t i = 0; i < alp::config::VECTOR_SIZE - 1; i++) { + flags[1 + i] = (alp_bench::ChimpConstants::Flags)flag_buffer.Extract(); + } + + for (size_t i = 0; i < leading_zero_block_size; i++) { + leading_zero_unpacked[i] = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[leading_zero_buffer.Extract()]; + } + + for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) { + dec_arr[i] = alp_bench::ChimpDecompression::Load( + flags[i], leading_zero_unpacked, leading_zero_index, chimp_de_state); + } + + chimp_de_state.Reset(); + + dec_dbl_p = reinterpret_cast(dec_arr); + + for (size_t j = 0; j < alp::config::VECTOR_SIZE; j++) { + auto l = dbl_arr[j]; + auto r = dec_dbl_p[j]; + if (l != r) { std::cerr << j << ", " << rowgroup_offset << ", " << dataset.name << "\n"; } + ASSERT_EQ(dbl_arr[j], dec_dbl_p[j]); + } + 
vector_idx = 0; + vectors_count += 1; + } + auto processed_tuples = vectors_count * alp::config::VECTOR_SIZE; + auto compression_ratio = (double)compressed_data_size / processed_tuples; + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," << vectors_count + << std::endl; + } +} diff --git a/benchmarks/bench_compression_ratio/chimp128.cpp b/benchmarks/bench_compression_ratio/chimp128.cpp new file mode 100644 index 0000000..119d533 --- /dev/null +++ b/benchmarks/bench_compression_ratio/chimp128.cpp @@ -0,0 +1,186 @@ +#include "chimp/chimp128.hpp" +#include "alp.hpp" +#include "data.hpp" +#include "test/mapper.hpp" +#include "gtest/gtest.h" + +class chimp128_test : public ::testing::Test { +public: + uint8_t* data_arr; + uint8_t* flags_arr; + uint8_t* leading_zero_arr; + uint16_t* packed_data_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + alp_bench::Chimp128CompressionState com_stt; + alp_bench::ChimpConstants::Flags* flags; + uint8_t* leading_zero_unpacked; + alp_bench::UnpackedData* unpacked_data_arr; + alp_bench::FlagBuffer flag_buffer; + alp_bench::LeadingZeroBuffer leading_zero_buffer; + alp_bench::Chimp128DecompressionState chimp_de_state; + uint32_t leading_zero_index; + uint8_t leading_zero_block_count; + size_t leading_zero_block_size; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new uint8_t[8096]; + flags_arr = new uint8_t[1025]; + leading_zero_arr = new uint8_t[1024]; + dec_arr = new uint64_t[1024]; + leading_zero_unpacked = new uint8_t[1024]; + flags = new alp_bench::ChimpConstants::Flags[1024]; + packed_data_arr = new uint16_t[1024]; + unpacked_data_arr = new alp_bench::UnpackedData[1024]; + } + + ~chimp128_test() override { + delete[] dbl_arr; + delete[] data_arr; + delete[] flags_arr; + delete[] leading_zero_arr; + delete[] dec_arr; + delete[] packed_data_arr; + delete[] unpacked_data_arr; + } +}; + +/* + * Chimp overhead per vector in a hypothetic file format = leading_zero_block_count + size_of_data_block + + * start_of_data; Start of Data is needed if Data Blocks and Metadata are stored separately (like in DuckDB to optimize + * decoding speed) + */ +double chimp128_overhead_per_vector {static_cast(8 + 16 + 16)}; + +TEST_F(chimp128_test, test_chimp128_on_whole_datasets) { + + if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v == nullptr) { + throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!"); + } + + std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "chimp128_compression_ratio.csv", std::ios::out); + ofile << "dataset,size,vectors_count\n"; + + for (auto& dataset : alp_bench::alp_dataset) { + + std::cout << dataset.name << std::endl; + + size_t compressed_data_size = 0; + + size_t tuples_count; + auto* data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path); + double value_to_encode {0.0}; + size_t vector_idx {0}; + size_t rowgroup_offset {0}; + size_t vectors_count = {0}; + /* Encode - Decode - Validate. 
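+		 * The accounting below charges each vector (AlignValue(bytes_used_by_data) + flag_bytes +
+		 * 3 * leading_zero_block_count + 2 * packed_data_blocks_count) bytes, converted to bits, plus
+		 * the 40-bit chimp128_overhead_per_vector. Illustrative example with assumed numbers, taking
+		 * AlignValue to round up to a multiple of 8: 900 data bytes, 256 flag bytes, 3 leading-zero
+		 * blocks and 100 packed blocks give (904 + 256 + 9 + 200) * 8 + 40 = 10992 bits, about
+		 * 10.7 bits per value for 1024 values.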
*/
+		for (size_t i = 0; i < tuples_count; i++) {
+			value_to_encode     = data_column[i];
+			dbl_arr[vector_idx] = value_to_encode;
+			vector_idx          = vector_idx + 1;
+			rowgroup_offset     = rowgroup_offset + 1;
+
+			if (vector_idx != alp::config::VECTOR_SIZE) { continue; }
+
+			// Init Encoding
+			com_stt.Reset();
+			com_stt.output.SetStream(data_arr);
+			com_stt.leading_zero_buffer.SetBuffer(leading_zero_arr);
+			com_stt.flag_buffer.SetBuffer(flags_arr);
+			com_stt.packed_data_buffer.SetBuffer(packed_data_arr);
+
+			/*
+			 *
+			 * Encode
+			 *
+			 */
+			uint64_p = reinterpret_cast<uint64_t*>(dbl_arr);
+			for (size_t i {0}; i < alp::config::VECTOR_SIZE; ++i) {
+				alp_bench::Chimp128Compression::Store(uint64_p[i], com_stt);
+			}
+
+			com_stt.Flush();
+			com_stt.output.Flush();
+
+			// SUM COMPRESSION SIZE
+			size_t   bytes_used_by_data                     = com_stt.output.BytesWritten();
+			size_t   flag_bytes                             = com_stt.flag_buffer.BytesUsed();
+			size_t   leading_zero_block_count               = com_stt.leading_zero_buffer.BlockCount();
+			size_t   bytes_used_by_leading_zero_blocks      = 3 * leading_zero_block_count;
+			uint16_t packed_data_blocks_count               = com_stt.packed_data_buffer.index;
+			uint64_t bytes_used_by_packed_data_blocks_count = packed_data_blocks_count * sizeof(uint16_t);
+			compressed_data_size += (alp_bench::AlignValue(bytes_used_by_data) + flag_bytes +
+			                         bytes_used_by_leading_zero_blocks + bytes_used_by_packed_data_blocks_count) *
+			                        8;
+			compressed_data_size += chimp128_overhead_per_vector;
+
+			// Init decoding
+			leading_zero_block_count = com_stt.leading_zero_buffer.BlockCount();
+			leading_zero_block_size  = static_cast<size_t>(leading_zero_block_count) * 8;
+			uint32_t unpacked_index  = 0;
+			leading_zero_index       = 0;
+			chimp_de_state.input.SetStream(data_arr);
+			flag_buffer.SetBuffer(flags_arr);
+			leading_zero_buffer.SetBuffer(leading_zero_arr);
+
+			// Decode flags
+			flags[0] = alp_bench::ChimpConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag
+			for (size_t i = 0; i < alp::config::VECTOR_SIZE - 1; i++) {
+				flags[1 + i] = (alp_bench::ChimpConstants::Flags)flag_buffer.Extract();
+			}
+
+			// Decode leading zero
+			for (size_t i = 0; i < leading_zero_block_size; i++) {
+				leading_zero_unpacked[i] =
+				    alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[leading_zero_buffer.Extract()];
+			}
+
+			/*
+			 * Count the flags equal to TRAILING_EXCEEDS_THRESHOLD: that is exactly the number of
+			 * packed-data blocks, since Chimp128 emits one extra 16-bit block for each such value.
+			 * Only VECTOR_SIZE - 1 flags follow the implicit first one.
+			 */
+			size_t packed_data_block_count = 0;
+			for (size_t i = 0; i < alp::config::VECTOR_SIZE - 1; i++) {
+				packed_data_block_count += flags[1 + i] == alp_bench::ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD;
+			}
+
+			for (size_t i = 0; i < packed_data_block_count; i++) {
+				alp_bench::PackedDataUtils::Unpack(((uint16_t*)packed_data_arr)[i], unpacked_data_arr[i]);
+				if (unpacked_data_arr[i].significant_bits == 0) { unpacked_data_arr[i].significant_bits = 64; }
+				unpacked_data_arr[i].leading_zero =
+				    alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[unpacked_data_arr[i].leading_zero];
+			}
+
+			chimp_de_state.Reset();
+
+			for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) {
+				dec_arr[i] = alp_bench::Chimp128Decompression::Load(flags[i],
+				                                                    leading_zero_unpacked,
+				                                                    leading_zero_index,
+				                                                    unpacked_data_arr,
+				                                                    unpacked_index,
+				                                                    chimp_de_state);
+			}
+
+			dec_dbl_p = reinterpret_cast<double*>(dec_arr);
+
+			for (size_t j = 0; j < alp::config::VECTOR_SIZE; j++) {
+				auto l = dbl_arr[j];
+				auto r = dec_dbl_p[j];
+				if (l != r) { std::cerr << j << ", " << rowgroup_offset
<< ", " << dataset.name << "\n"; } + ASSERT_EQ(dbl_arr[j], dec_dbl_p[j]); + } + vector_idx = 0; + vectors_count += 1; + } + auto processed_tuples = vectors_count * alp::config::VECTOR_SIZE; + auto compression_ratio = (double)compressed_data_size / processed_tuples; + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," << vectors_count + << std::endl; + } +} diff --git a/benchmarks/bench_compression_ratio/gorillas.cpp b/benchmarks/bench_compression_ratio/gorillas.cpp new file mode 100644 index 0000000..6fce113 --- /dev/null +++ b/benchmarks/bench_compression_ratio/gorillas.cpp @@ -0,0 +1,133 @@ +#include "gorillas/gorillas.hpp" +#include "alp.hpp" +#include "data.hpp" +#include "test/mapper.hpp" +#include "gtest/gtest.h" + +class gorillas_test : public ::testing::Test { +public: + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + alp_bench::GorillasCompressionState state; + alp_bench::GorillasConstants::Flags* flags; + alp_bench::GorillasDecompressionState gorillas_de_state; + uint8_t* data_arr; + uint8_t* flags_arr; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new uint8_t[8192 + 1024]; // We leave some headroom in case of negative compression + dec_arr = new uint64_t[1024]; + flags = new alp_bench::GorillasConstants::Flags[1024]; + flags_arr = new uint8_t[1024]; + } + + ~gorillas_test() override { + delete[] dbl_arr; + delete[] data_arr; + delete[] flags_arr; + delete[] dec_arr; + } +}; + +/* + * Gorillas overhead per vector in a hypothetic file format = size_of_data_block + start_of_data + size_of_data; + * Start of Data is needed if Data Blocks and Metadata are stored separately (like in DuckDB to optimize decoding speed) + */ +double gorillas_overhead_per_vector {static_cast(16 + 16 + 16)}; + +TEST_F(gorillas_test, test_gorillas_on_whole_datasets) { + + if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v == nullptr) { + throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!"); + } + + std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "gorillas_compression_ratio.csv", std::ios::out); + ofile << "dataset,size,vectors_count\n"; + + for (auto& dataset : alp_bench::alp_dataset) { + + std::cout << dataset.name << std::endl; + + size_t compressed_data_size = 0; + + size_t tuples_count; + auto* data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path); + double value_to_encode {0.0}; + size_t vector_idx {0}; + size_t rowgroup_offset {0}; + size_t vectors_count = {0}; + /* Encode - Decode - Validate. 
*/ + for (size_t i = 0; i < tuples_count; i++) { + value_to_encode = data_column[i]; + dbl_arr[vector_idx] = value_to_encode; + vector_idx = vector_idx + 1; + rowgroup_offset = rowgroup_offset + 1; + + if (vector_idx != alp::config::VECTOR_SIZE) { continue; } + + // Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < alp::config::VECTOR_SIZE; i++) { + alp_bench::GorillasCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + + // SUM COMPRESSION SIZE + size_t bytes_used_by_data = state.output.BytesWritten(); + size_t flag_bytes = state.flag_buffer.BytesUsed(); + compressed_data_size += (alp_bench::AlignValue(bytes_used_by_data) + flag_bytes) * 8; + compressed_data_size += gorillas_overhead_per_vector; + + // Init decoding + gorillas_de_state.input.SetStream(data_arr); + alp_bench::FlagBuffer flag_buffer; + flag_buffer.SetBuffer(flags_arr); + + /* + * + * DECODE + * + */ + flags[0] = alp_bench::GorillasConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (size_t i = 0; i < alp::config::VECTOR_SIZE - 1; i++) { + flags[1 + i] = (alp_bench::GorillasConstants::Flags)flag_buffer.Extract(); + } + + gorillas_de_state.Reset(); + + for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) { + dec_arr[i] = alp_bench::GorillasDecompression::Load(flags[i], gorillas_de_state); + } + + dec_dbl_p = reinterpret_cast(dec_arr); + + for (size_t j = 0; j < alp::config::VECTOR_SIZE; j++) { + auto l = dbl_arr[j]; + auto r = dec_dbl_p[j]; + if (l != r) { std::cerr << j << ", " << rowgroup_offset << ", " << dataset.name << "\n"; } + ASSERT_EQ(dbl_arr[j], dec_dbl_p[j]); + } + vector_idx = 0; + vectors_count += 1; + } + auto processed_tuples = vectors_count * alp::config::VECTOR_SIZE; + auto compression_ratio = (double)compressed_data_size / processed_tuples; + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," << vectors_count + << std::endl; + } +} diff --git a/benchmarks/bench_compression_ratio/patas.cpp b/benchmarks/bench_compression_ratio/patas.cpp new file mode 100644 index 0000000..5ad57b8 --- /dev/null +++ b/benchmarks/bench_compression_ratio/patas.cpp @@ -0,0 +1,139 @@ +#include "patas/patas.hpp" +#include "alp.hpp" +#include "data.hpp" +#include "test/mapper.hpp" +#include "gtest/gtest.h" + +class patas_test : public ::testing::Test { +public: + uint8_t* data_arr; + uint16_t* packed_data_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* uint64_p; + uint64_t* dec_arr; + + // Encode + uint16_t* packed_metadata; + alp_bench::patas::PatasCompressionState patas_state; + alp_bench::patas::PatasUnpackedValueStats* unpacked_data; + + // Decode + alp_bench::ByteReader byte_reader; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new uint8_t[8192 + 2048]; // We leave some overhead room in case of negative compression + packed_data_arr = new uint16_t[1024]; + packed_metadata = new uint16_t[1024]; + dec_arr = new uint64_t[1024]; + unpacked_data = new alp_bench::patas::PatasUnpackedValueStats[1024]; + } + + ~patas_test() override { + delete[] dbl_arr; + delete[] dec_arr; + delete[] data_arr; + delete[] packed_data_arr; + delete[] packed_metadata; + delete[] unpacked_data; + } +}; + +/* + * Patas overhead per vector in a hypothetic file format = next_block_offset; + * Next block offset is needed to be able to skip blocks of data + */ +double 
patas_overhead_per_vector {static_cast(16)}; + +TEST_F(patas_test, test_patas_on_whole_datasets) { + + if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v == nullptr) { + throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!"); + } + + std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "patas_compression_ratio.csv", std::ios::out); + ofile << "dataset,size,vectors_count\n"; + + for (auto& dataset : alp_bench::alp_dataset) { + + std::cout << dataset.name << std::endl; + + size_t compressed_data_size = 0; + + size_t tuples_count; + auto* data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path); + double value_to_encode {0.0}; + size_t vector_idx {0}; + size_t rowgroup_offset {0}; + size_t vectors_count = {0}; + /* Encode - Decode - Validate. */ + for (size_t i = 0; i < tuples_count; i++) { + value_to_encode = data_column[i]; + dbl_arr[vector_idx] = value_to_encode; + vector_idx = vector_idx + 1; + rowgroup_offset = rowgroup_offset + 1; + + if (vector_idx != alp::config::VECTOR_SIZE) { continue; } + + // Init Encoding + patas_state.SetOutputBuffer(data_arr); + patas_state.packed_data_buffer.SetBuffer(packed_metadata); + patas_state.Reset(); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < alp::config::VECTOR_SIZE; ++i) { + alp_bench::patas::PatasCompression::Store(uint64_p[i], patas_state); + } + + // SUM COMPRESSION SIZE + size_t bytes_used_by_data = patas_state.byte_writer.BytesWritten(); + size_t packed_data_size = patas_state.packed_data_buffer.index * sizeof(uint16_t); + compressed_data_size += (alp_bench::AlignValue(bytes_used_by_data) + packed_data_size) * 8; + compressed_data_size += patas_overhead_per_vector; + + // Init decoding + byte_reader.SetStream(data_arr); + + /* + * + * DECODE + * + */ + // UNPACKING METADATA (16 bits) + for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) { + alp_bench::PackedDataUtils::Unpack(packed_metadata[i], + (alp_bench::UnpackedData&)unpacked_data[i]); + } + + // USING UNPACKED METADATA AND DATA BUFFER WE LOAD THE DOUBLE VALUES + dec_arr[0] = (uint64_t)0; + for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) { + dec_arr[i] = alp_bench::patas::PatasDecompression::DecompressValue( + byte_reader, + unpacked_data[i].significant_bytes, + unpacked_data[i].trailing_zeros, + dec_arr[i - unpacked_data[i].index_diff]); + } + + for (size_t j = 0; j < alp::config::VECTOR_SIZE; j++) { + if (uint64_p[j] != dec_arr[j]) { + std::cout << j << ", " << rowgroup_offset << ", " << dataset.name << std::endl; + } + ASSERT_EQ(uint64_p[j], dec_arr[j]); + } + vector_idx = 0; + vectors_count += 1; + } + auto processed_tuples = vectors_count * alp::config::VECTOR_SIZE; + auto compression_ratio = (double)compressed_data_size / processed_tuples; + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," << vectors_count + << std::endl; + } +} diff --git a/benchmarks/bench_compression_ratio/zstd.cpp b/benchmarks/bench_compression_ratio/zstd.cpp new file mode 100644 index 0000000..bc1d056 --- /dev/null +++ b/benchmarks/bench_compression_ratio/zstd.cpp @@ -0,0 +1,90 @@ +#include "zstd.h" +#include "alp.hpp" +#include "data.hpp" +#include "test/mapper.hpp" +#include "gtest/gtest.h" + +class zstd_test : public ::testing::Test { +public: + double* dbl_arr; + void* enc_dbl_arr; + void* dec_dbl_arr; + size_t ZSTD_VECTOR_SIZE = + alp::config::ROWGROUP_SIZE; // For Zstd we compress rowgroups since it would be unfair to compress small vectors + size_t 
ENC_SIZE_UPPER_BOUND = ZSTD_VECTOR_SIZE * 8; + size_t INPUT_SIZE = ZSTD_VECTOR_SIZE * 8; + size_t DEC_SIZE = INPUT_SIZE; + + void SetUp() override { + dbl_arr = new double[ZSTD_VECTOR_SIZE]; + enc_dbl_arr = malloc(INPUT_SIZE); + dec_dbl_arr = malloc(INPUT_SIZE); + + const auto v = std::getenv("ALP_DATASET_DIR_PATH"); + if (v == nullptr) { throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!"); } + alp_bench::PATHS.ALP_DATASET_BINARY_DIR_PATH = v; + } + + ~zstd_test() override { + delete[] dbl_arr; + free(enc_dbl_arr); + free(dec_dbl_arr); + } +}; + +TEST_F(zstd_test, test_zstd_on_whole_datasets) { + std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "zstd_compression_ratio.csv", std::ios::out); + ofile << "dataset,size\n"; + + for (auto& dataset : alp_bench::alp_dataset) { + if (dataset.name.find("bw") != std::string::npos) { continue; } + + size_t tuples_count; + const auto* data_column = mapper::mmap_file(tuples_count, dataset.binary_file_path); + double value_to_encode = 0.0; + size_t vector_idx {0}; + size_t processed_tuples = 0; + + size_t compressed_data_size = 0; + + std::cout << dataset.name << "\n"; + + if (tuples_count < ZSTD_VECTOR_SIZE) { + ZSTD_VECTOR_SIZE = tuples_count; + INPUT_SIZE = ZSTD_VECTOR_SIZE * 8; + ENC_SIZE_UPPER_BOUND = ZSTD_VECTOR_SIZE * 8; + } + + /* Encode - Decode - Validate. */ + for (size_t i = 0; i < tuples_count; i++) { + value_to_encode = data_column[i]; + dbl_arr[vector_idx] = value_to_encode; + vector_idx = vector_idx + 1; + + if (vector_idx != ZSTD_VECTOR_SIZE) { continue; } + + processed_tuples += ZSTD_VECTOR_SIZE; + + // Encode + size_t const ENC_SIZE = ZSTD_compress(enc_dbl_arr, ENC_SIZE_UPPER_BOUND, dbl_arr, INPUT_SIZE, 3); // Level 3 + + // SUM COMPRESSED SIZE + compressed_data_size += ENC_SIZE * 8; + + // Decode + ZSTD_decompress(dec_dbl_arr, DEC_SIZE, enc_dbl_arr, ENC_SIZE); + + const auto* dec_dbl_arr_tmp = static_cast(dec_dbl_arr); + for (size_t j = 0; j < ZSTD_VECTOR_SIZE; ++j) { + const auto l = dbl_arr[j]; + if (const auto r = dec_dbl_arr_tmp[j]; l != r) { std::cerr << j << ", " << dataset.name << "\n"; } + ASSERT_EQ(dbl_arr[j], dec_dbl_arr_tmp[j]); + } + vector_idx = 0; + } + + auto compression_ratio = (double)compressed_data_size / processed_tuples; + + ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << std::endl; + } +} diff --git a/benchmarks/bench_speed/CMakeLists.txt b/benchmarks/bench_speed/CMakeLists.txt new file mode 100644 index 0000000..47cb6f2 --- /dev/null +++ b/benchmarks/bench_speed/CMakeLists.txt @@ -0,0 +1,53 @@ +# Bench ALP -------------------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_alp.hpp) +add_executable(bench_alp_encode bench_alp_encode.cpp) +target_include_directories(bench_alp_encode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(bench_alp_encode PRIVATE ${CMAKE_SOURCE_DIR}/include) +target_link_libraries(bench_alp_encode PRIVATE ALP) + +# Bench ALP CUTTER ENCODE ---------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_alp.hpp) +add_executable(bench_alp_cutter_encode bench_alp_cutter_encode.cpp) +target_include_directories(bench_alp_cutter_encode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(bench_alp_cutter_encode PRIVATE 
${CMAKE_SOURCE_DIR}/include) +target_link_libraries(bench_alp_cutter_encode PRIVATE ALP) + +# Bench ALP CUTTER DECODE ---------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_alp.hpp) +add_executable(bench_alp_cutter_decode bench_alp_cutter_decode.cpp) +target_include_directories(bench_alp_cutter_decode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(bench_alp_cutter_decode PRIVATE ${CMAKE_SOURCE_DIR}/include) +target_link_libraries(bench_alp_cutter_decode PRIVATE ALP) + +# Bench ALP without SAMPLING -------------------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_alp_tmp.hpp) +add_executable(bench_alp_without_sampling bench_alp_without_sampling.cpp) +target_include_directories(bench_alp_without_sampling PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(bench_alp_without_sampling PRIVATE ${CMAKE_SOURCE_DIR}/include) +target_link_libraries(bench_alp_without_sampling PRIVATE ALP) + +# Bench PATAS ---------------------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_patas.hpp) +add_executable(bench_patas bench_patas.cpp) +target_include_directories(bench_patas PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +# Bench CHIMP128 ------------------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_chimp128.hpp) +add_executable(bench_chimp128 bench_chimp128.cpp) +target_include_directories(bench_chimp128 PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +# Bench CHIMP ---------------------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_chimp.hpp) +add_executable(bench_chimp bench_chimp.cpp) +target_include_directories(bench_chimp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +# Bench GORILLAS ------------------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_gorillas.hpp) +add_executable(bench_gorillas bench_gorillas.cpp) +target_include_directories(bench_gorillas PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +# Test ZSTD: ---------------------------------------------------------------------------------------------------------- +configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/bench_zstd.hpp) +add_executable(bench_zstd bench_zstd.cpp) +target_include_directories(bench_zstd PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_link_libraries(bench_zstd PRIVATE libzstd) diff --git a/benchmarks/bench_speed/bench_alp_cutter_decode.cpp b/benchmarks/bench_speed/bench_alp_cutter_decode.cpp new file mode 100644 index 0000000..b1be869 --- /dev/null +++ b/benchmarks/bench_speed/bench_alp_cutter_decode.cpp @@ -0,0 +1,168 @@ +#include "alp.hpp" +#include "bench_alp.hpp" +#include "data.hpp" + +using namespace alp::config; + +/* Bench ALP encode. 
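+ * (Despite the "encode" wording, b_a_e below times the ALP RD decoding path: unffor of the right
+ * and left parts followed by alp::AlpRD::decode. It reports cycles / (iterations * 1024), i.e.
+ * CPU cycles per decoded value, and emits the benchmark as dataset.name + "_decode".)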
*/ +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const double* dbl_arr, + uint16_t* exc_arr, + uint16_t* pos_arr, + uint16_t* exc_c_arr, + int64_t* encoded_arr, + uint8_t& bw, + int64_t* ffor_arr, + int64_t* base_arr, + alp::state& stt, + alp_bench::Column& dataset, + uint64_t* ffor_right_arr, + uint16_t* ffor_left_arr, + uint64_t* right_arr, + uint16_t* left_arr, + uint64_t* unffor_right_arr, + uint16_t* unffor_left_arr, + double* glue_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_decode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t i = 0; i < iterations; ++i) { + unffor::unffor(ffor_right_arr, unffor_right_arr, stt.right_bit_width, &stt.right_for_base); + unffor::unffor(ffor_left_arr, unffor_left_arr, stt.left_bit_width, &stt.left_for_base); + alp::AlpRD::decode(glue_arr, unffor_right_arr, unffor_left_arr, exc_arr, pos_arr, exc_c_arr, stt); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + double* dbl_arr; + uint16_t* exc_arr; + uint16_t* pos_arr; + uint16_t* exc_c_arr; + int64_t* ffor_arr; + int64_t* base_arr; + int64_t* encoded_arr; + double* dec_dbl_arr; + double* rg_smp_arr; + uint64_t* ffor_right_arr; + uint16_t* ffor_left_arr; + uint64_t* right_arr; + uint16_t* left_arr; + uint64_t* unffor_right_arr; + uint16_t* unffor_left_arr; + double* glue_arr; + + uint8_t bw; + + dbl_arr = new (std::align_val_t {64}) double[1024]; + exc_arr = new (std::align_val_t {64}) uint16_t[1024]; + pos_arr = new (std::align_val_t {64}) uint16_t[1024]; + encoded_arr = new (std::align_val_t {64}) int64_t[1024]; + dec_dbl_arr = new (std::align_val_t {64}) double[1024]; + exc_c_arr = new (std::align_val_t {64}) uint16_t[1024]; + ffor_arr = new (std::align_val_t {64}) int64_t[1024]; + base_arr = new (std::align_val_t {64}) int64_t[1024]; + rg_smp_arr = new (std::align_val_t {64}) double[VECTOR_SIZE]; + right_arr = new (std::align_val_t {64}) uint64_t[VECTOR_SIZE]; + left_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + ffor_right_arr = new (std::align_val_t {64}) uint64_t[VECTOR_SIZE]; + ffor_left_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + unffor_right_arr = new (std::align_val_t {64}) uint64_t[VECTOR_SIZE]; + unffor_left_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + glue_arr = new (std::align_val_t {64}) double[VECTOR_SIZE]; + + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + if (!dataset.suitable_for_cutting) { continue; } + if (dataset.name.find("bw") != std::string::npos) { continue; } + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + size_t n_values = 1024; + + size_t global_c {0}; + alp::state stt; + + alp::AlpRD::init(dbl_arr, global_c, n_values, rg_smp_arr, stt); + + alp::AlpRD::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, right_arr, left_arr, stt); + ffor::ffor(right_arr, ffor_right_arr, 
stt.right_bit_width, &stt.right_for_base); + ffor::ffor(left_arr, ffor_left_arr, stt.left_bit_width, &stt.left_for_base); + + // benchmark alp encode + benchmark.Run(b_a_e(dbl_arr, + exc_arr, + pos_arr, + exc_c_arr, + encoded_arr, + bw, + ffor_arr, + base_arr, + stt, + dataset, + ffor_right_arr, + ffor_left_arr, + right_arr, + left_arr, + unffor_right_arr, + unffor_left_arr, + glue_arr)); + + // decode + + // Validate + for (size_t j = 0; j < VECTOR_SIZE; ++j) { + auto l = dbl_arr[j]; + auto r = glue_arr[j]; + if (l != r) { + std::cerr << j << ", " << global_c << ", " << dataset.name << "\n"; + std::exit(-1); + } + } + + auto exceptions_count = exc_c_arr[0]; + if (dataset.exceptions_count != exceptions_count) { + std::cout << dataset.name << " with exceptions_count : " << dataset.exceptions_count << " should be " + << exceptions_count << "\n"; + } + + if (dataset.bit_width != bw) { + std::cout << dataset.name << " with bw " << static_cast(dataset.bit_width) << " should be " + << static_cast(bw) << "\n"; + } + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("alp_decode_cutter") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/bench_speed/bench_alp_cutter_encode.cpp b/benchmarks/bench_speed/bench_alp_cutter_encode.cpp new file mode 100644 index 0000000..418276d --- /dev/null +++ b/benchmarks/bench_speed/bench_alp_cutter_encode.cpp @@ -0,0 +1,164 @@ +#include "alp.hpp" +#include "bench_alp.hpp" +#include "data.hpp" + +using namespace alp::config; +/* Bench ALP encode. */ +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const double* dbl_arr, + uint16_t* exc_arr, + uint16_t* pos_arr, + uint16_t* exc_c_arr, + int64_t* encoded_arr, + uint8_t& bw, + int64_t* ffor_arr, + int64_t* base_arr, + alp::state& stt, + alp_bench::Column& dataset, + uint64_t* ffor_right_arr, + uint16_t* ffor_left_arr, + uint64_t* right_arr, + uint16_t* left_arr, + uint64_t* unffor_right_arr, + uint16_t* unffor_left_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t i = 0; i < iterations; ++i) { + alp::AlpRD::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, right_arr, left_arr, stt); + ffor::ffor(right_arr, ffor_right_arr, stt.right_bit_width, &stt.right_for_base); + ffor::ffor(left_arr, ffor_left_arr, stt.left_bit_width, &stt.left_for_base); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * VECTOR_SIZE)); +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + double* dbl_arr; + uint16_t* exc_arr; + uint16_t* pos_arr; + uint16_t* exc_c_arr; + int64_t* ffor_arr; + int64_t* base_arr; + int64_t* encoded_arr; + double* dec_dbl_arr; + double* rg_smp_arr; + uint64_t* ffor_right_arr; + uint16_t* ffor_left_arr; + uint64_t* right_arr; + uint16_t* left_arr; + uint64_t* unffor_right_arr; + uint16_t* unffor_left_arr; + double* glue_arr; + + uint8_t bw; + + dbl_arr = new (std::align_val_t {64}) double[VECTOR_SIZE]; + exc_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + pos_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + encoded_arr = 
new (std::align_val_t {64}) int64_t[VECTOR_SIZE]; + dec_dbl_arr = new (std::align_val_t {64}) double[VECTOR_SIZE]; + exc_c_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + ffor_arr = new (std::align_val_t {64}) int64_t[VECTOR_SIZE]; + base_arr = new (std::align_val_t {64}) int64_t[VECTOR_SIZE]; + rg_smp_arr = new (std::align_val_t {64}) double[VECTOR_SIZE]; + right_arr = new (std::align_val_t {64}) uint64_t[VECTOR_SIZE]; + left_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + ffor_right_arr = new (std::align_val_t {64}) uint64_t[VECTOR_SIZE]; + ffor_left_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + unffor_right_arr = new (std::align_val_t {64}) uint64_t[VECTOR_SIZE]; + unffor_left_arr = new (std::align_val_t {64}) uint16_t[VECTOR_SIZE]; + glue_arr = new (std::align_val_t {64}) double[VECTOR_SIZE]; + + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + if (!dataset.suitable_for_cutting) { continue; } + if (dataset.name.find("bw") != std::string::npos) { continue; } + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + size_t n_values = VECTOR_SIZE; + + size_t global_c {0}; + alp::state stt; + + alp::AlpRD::init(dbl_arr, global_c, n_values, rg_smp_arr, stt); + + // benchmark alp encode + benchmark.Run(b_a_e(dbl_arr, + exc_arr, + pos_arr, + exc_c_arr, + encoded_arr, + bw, + ffor_arr, + base_arr, + stt, + dataset, + ffor_right_arr, + ffor_left_arr, + right_arr, + left_arr, + unffor_right_arr, + unffor_left_arr)); + + // decode + unffor::unffor(ffor_right_arr, unffor_right_arr, stt.right_bit_width, &stt.right_for_base); + unffor::unffor(ffor_left_arr, unffor_left_arr, stt.left_bit_width, &stt.left_for_base); + alp::AlpRD::decode(glue_arr, unffor_right_arr, unffor_left_arr, exc_arr, pos_arr, exc_c_arr, stt); + + // Validate + for (size_t j = 0; j < VECTOR_SIZE; ++j) { + auto l = dbl_arr[j]; + auto r = glue_arr[j]; + if (l != r) { + std::cerr << j << ", " << global_c << ", " << dataset.name << "\n"; + std::exit(-1); + } + } + + auto exceptions_count = exc_c_arr[0]; + if (dataset.exceptions_count != exceptions_count) { + std::cout << dataset.name << " with exceptions_count : " << dataset.exceptions_count << " should be " + << exceptions_count << "\n"; + } + + if (dataset.bit_width != bw) { + std::cout << dataset.name << " with bw " << static_cast(dataset.bit_width) << " should be " + << static_cast(bw) << "\n"; + } + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("alp_encode_cutter") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/bench_speed/bench_alp_encode.cpp b/benchmarks/bench_speed/bench_alp_encode.cpp new file mode 100644 index 0000000..7774a5b --- /dev/null +++ b/benchmarks/bench_speed/bench_alp_encode.cpp @@ -0,0 +1,132 @@ +#include "alp.hpp" +#include "bench_alp.hpp" +#include "data.hpp" + +/* Bench ALP encode. 
*/
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const double*     dbl_arr,
+                                                                         double*           exc_arr,
+                                                                         uint16_t*         pos_arr,
+                                                                         uint16_t*         exc_c_arr,
+                                                                         int64_t*          encoded_arr,
+                                                                         uint8_t&          bw,
+                                                                         int64_t*          ffor_arr,
+                                                                         int64_t*          base_arr,
+                                                                         alp::state&       stt,
+                                                                         alp_bench::Column& dataset) {
+
+    int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+    uint64_t iterations = 300000;
+#else
+    uint64_t iterations = 1;
+#endif
+
+    std::string benchmark_name = dataset.name + "_encode";
+
+    uint64_t cycles = benchmark::cycleclock::Now();
+    for (uint64_t i = 0; i < iterations; ++i) {
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, encoded_arr, stt);
+        alp::AlpEncode::analyze_ffor(encoded_arr, bw, base_arr);
+        ffor::ffor(encoded_arr, ffor_arr, bw, base_arr);
+    }
+
+    cycles = benchmark::cycleclock::Now() - cycles;
+
+    return benchmark::BenchmarkReporter::Run(
+        benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
+}
+
+void benchmark_all(benchmark::Benchmark& benchmark) {
+    double*   dbl_arr;
+    double*   exc_arr;
+    uint16_t* pos_arr;
+    uint16_t* exc_c_arr;
+    int64_t*  ffor_arr;
+    int64_t*  base_arr;
+    int64_t*  encoded_arr;
+    double*   dec_dbl_arr;
+    double*   rg_smp_arr;
+
+    uint8_t bw;
+
+    dbl_arr     = new (std::align_val_t {64}) double[1024];
+    exc_arr     = new (std::align_val_t {64}) double[1024];
+    pos_arr     = new (std::align_val_t {64}) uint16_t[1024];
+    encoded_arr = new (std::align_val_t {64}) int64_t[1024];
+    dec_dbl_arr = new (std::align_val_t {64}) double[1024];
+    exc_c_arr   = new (std::align_val_t {64}) uint16_t[1024];
+    ffor_arr    = new (std::align_val_t {64}) int64_t[1024];
+    base_arr    = new (std::align_val_t {64}) int64_t[1024];
+    rg_smp_arr  = new (std::align_val_t {64}) double[1024];
+
+    for (auto& dataset : alp_bench::alp_dataset) {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+        if (dataset.suitable_for_cutting) { continue; }
+        if (dataset.name.find("bw") != std::string::npos) { continue; }
+
+        // check to see that the file was opened correctly:
+        if (!ifile.is_open()) {
+            std::cerr << "There was a problem opening the input file!\n";
+            exit(1); // exit or do additional error checking
+        }
+
+        double num = 0.0;
+        // keep storing values from the text file so long as data exists:
+        size_t c {0};
+        while (ifile >> num) {
+            dbl_arr[c] = num;
+            c += 1;
+        }
+
+        size_t n_values = 1024;
+
+        size_t     global_c {0};
+        alp::state stt;
+        alp::AlpEncode::init(dbl_arr, global_c, n_values, rg_smp_arr, stt); // 32 runs of 1 value
+
+        // benchmark alp encode
+        benchmark.Run(b_a_e(dbl_arr, exc_arr, pos_arr, exc_c_arr, encoded_arr, bw, ffor_arr, base_arr, stt, dataset));
+
+        // decode
+        generated::falp::fallback::scalar::falp(reinterpret_cast<uint64_t*>(ffor_arr),
+                                                dec_dbl_arr,
+                                                bw,
+                                                reinterpret_cast<uint64_t*>(base_arr),
+                                                stt.fac,
+                                                stt.exp);
+
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+
+        // Validate
+        for (size_t j = 0; j < alp::config::VECTOR_SIZE; ++j) {
+            auto l = dbl_arr[j];
+            auto r = dec_dbl_arr[j];
+            if (l != r) {
+                std::cerr << j << ", " << global_c << ", " << dataset.name << "\n";
+                std::exit(-1);
+            }
+        }
+
+        auto exceptions_count = exc_c_arr[0];
+        if (dataset.exceptions_count != exceptions_count) {
+            std::cout << dataset.name << " with exceptions_count : " << dataset.exceptions_count << " should be "
+                      << exceptions_count << "\n";
+        }
+
+        if (dataset.bit_width != bw) {
+            std::cout << dataset.name << " with bw " << static_cast<int>(dataset.bit_width) << " should be "
+                      << static_cast<int>(bw) << "\n";
+        }
+    }
+}
+
+int main() {
+
benchmark::Benchmark benchmark = + benchmark::create("alp_encode_pde") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/bench_speed/bench_alp_without_sampling.cpp b/benchmarks/bench_speed/bench_alp_without_sampling.cpp new file mode 100644 index 0000000..1d6b2b4 --- /dev/null +++ b/benchmarks/bench_speed/bench_alp_without_sampling.cpp @@ -0,0 +1,177 @@ +#include "alp.hpp" +#include "bench_alp.hpp" +#include "data.hpp" + +using namespace alp::config; + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_encode(const double* dbl_arr, + double* exc_arr, + uint16_t* pos_arr, + uint16_t* exc_c_arr, + uint64_t* bitmap, + int64_t* encoded_arr, + uint8_t& bw, + int64_t* ffor_arr, + int64_t* base_arr, + alp::state& stt, + alp_bench::Column& dataset) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t i = 0; i < iterations; ++i) { + alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, encoded_arr, stt); + alp::AlpEncode::analyze_ffor(encoded_arr, bw, base_arr); + ffor::ffor(encoded_arr, ffor_arr, bw, base_arr); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_encode_simdized(const double* dbl_arr, + double* exc_arr, + uint16_t* pos_arr, + uint16_t* exc_c_arr, + int64_t* encoded_arr, + uint8_t fac, + uint8_t exp, + alp_bench::Column& dataset, + uint8_t& bw, + int64_t* base_arr, + int64_t* ffor_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode_simdized"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t i = 0; i < iterations; ++i) { + alp::AlpEncode::encode_simdized(dbl_arr, exc_arr, pos_arr, exc_c_arr, encoded_arr, fac, exp); + alp::AlpEncode::analyze_ffor(encoded_arr, bw, base_arr); + ffor::ffor(encoded_arr, ffor_arr, bw, base_arr); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + double* dbl_arr; + double* exc_arr; + uint16_t* pos_arr; + uint16_t* exc_c_arr; + int64_t* ffor_arr; + int64_t* base_arr; + int64_t* encoded_arr; + double* dec_dbl_arr; + double* rg_smp_arr; + + uint8_t bw; + + uint16_t n_comb = 5; // Maximum number of combinations obtained from row group sampling + + dbl_arr = new (std::align_val_t {64}) double[1024]; + exc_arr = new (std::align_val_t {64}) double[1024]; + pos_arr = new (std::align_val_t {64}) uint16_t[1024]; + encoded_arr = new (std::align_val_t {64}) int64_t[1024]; + dec_dbl_arr = new (std::align_val_t {64}) double[1024]; + exc_c_arr = new (std::align_val_t {64}) uint16_t[1024]; + ffor_arr = new (std::align_val_t {64}) int64_t[1024]; + base_arr = new (std::align_val_t {64}) int64_t[1024]; + rg_smp_arr = new (std::align_val_t {64}) double[1024]; + + for (auto& 
dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + if (dataset.suitable_for_cutting) { continue; } + if (dataset.name.find("bw") != std::string::npos) { continue; } + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + size_t global_c {0}; + alp::state stt; + + benchmark.Run(bench_alp_encode_simdized(dbl_arr, + exc_arr, + pos_arr, + exc_c_arr, + encoded_arr, + dataset.factor, + dataset.exponent, + dataset, + bw, + base_arr, + ffor_arr)); + + // decode + generated::falp::fallback::scalar::falp(reinterpret_cast(ffor_arr), + dec_dbl_arr, + bw, + reinterpret_cast(base_arr), + dataset.factor, + dataset.exponent); + + alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr); + + // Validate + for (size_t j = 0; j < VECTOR_SIZE; ++j) { + auto l = dbl_arr[j]; + auto r = dec_dbl_arr[j]; + if (l != r) { + std::cerr << j << ", " << global_c << ", " << dataset.name << "\n"; + std::exit(-1); + } + } + + auto exceptions_count = exc_c_arr[0]; + if (dataset.exceptions_count != exceptions_count) { + std::cout << dataset.name << " with exceptions_count : " << dataset.exceptions_count << " should be " + << exceptions_count << "\n"; + } + + if (dataset.bit_width != bw) { + std::cout << dataset.name << " with bw " << static_cast(dataset.bit_width) << " should be " + << static_cast(bw) << "\n"; + } + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("alp_encode_without_sampling") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/bench_speed/bench_chimp.cpp b/benchmarks/bench_speed/bench_chimp.cpp new file mode 100644 index 0000000..d1aceae --- /dev/null +++ b/benchmarks/bench_speed/bench_chimp.cpp @@ -0,0 +1,209 @@ +#include "bench_chimp.hpp" +#include "chimp/chimp.hpp" +#include "data.hpp" + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_decode_chimp(alp_bench::Column& dataset, + idx_t leading_zero_block_size, + uint32_t leading_zero_index, + alp_bench::ChimpDecompressionState chimp_de_state, + alp_bench::FlagBuffer flag_buffer, + alp_bench::LeadingZeroBuffer leading_zero_buffer, + alp_bench::ChimpConstants::Flags* flags, + uint8_t* leading_zero_unpacked, + uint64_t* dec_arr, + uint8_t* flags_arr, + uint8_t* data_arr, + uint8_t* leading_zero_arr, + uint8_t leading_zero_block_count, + alp_bench::ChimpCompressionState state) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_decode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + flags[0] = alp_bench::ChimpConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (idx_t i = 0; i < 1023; i++) { + flags[1 + i] = (alp_bench::ChimpConstants::Flags)flag_buffer.Extract(); + } + + for (idx_t i = 0; i < leading_zero_block_size; i++) { + leading_zero_unpacked[i] = + 
alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[leading_zero_buffer.Extract()]; + } + + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::ChimpDecompression::Load( + flags[i], leading_zero_unpacked, leading_zero_index, chimp_de_state); + } + chimp_de_state.Reset(); + chimp_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + leading_zero_buffer.SetBuffer(leading_zero_arr); + leading_zero_block_count = state.leading_zero_buffer.BlockCount(); + leading_zero_block_size = static_cast(leading_zero_block_count) * 8; + leading_zero_index = 0; + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_encode_chimp(alp_bench::Column& dataset, + alp_bench::ChimpCompressionState state, + uint8_t* data_arr, + uint8_t* flags_arr, + uint8_t* leading_zero_arr, + uint64_t* uint64_p, + double* dbl_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + // Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.leading_zero_buffer.SetBuffer(leading_zero_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::ChimpCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + uint8_t* data_arr; + uint8_t* flags_arr; + uint8_t* leading_zero_arr; + double* dbl_arr; + uint64_t* dec_arr; + uint64_t* uint64_p; + alp_bench::ChimpCompressionState state; + alp_bench::ChimpConstants::Flags* flags; + uint8_t* leading_zero_unpacked; + alp_bench::FlagBuffer flag_buffer; + alp_bench::LeadingZeroBuffer leading_zero_buffer; + alp_bench::ChimpDecompressionState chimp_de_state; + uint32_t leading_zero_index; + uint8_t leading_zero_block_count; + idx_t leading_zero_block_size; + + dbl_arr = new (std::align_val_t {64}) double[1024]; + data_arr = new (std::align_val_t {64}) uint8_t[8096]; + flags_arr = new (std::align_val_t {64}) uint8_t[1025]; + leading_zero_arr = new (std::align_val_t {64}) uint8_t[1024]; + dec_arr = new (std::align_val_t {64}) uint64_t[1024]; + flags = new (std::align_val_t {64}) alp_bench::ChimpConstants::Flags[1024]; + leading_zero_unpacked = new (std::align_val_t {64}) uint8_t[1024]; + + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + // Benchmark decoding + benchmark.Run(bench_encode_chimp(dataset, state, data_arr, flags_arr, leading_zero_arr, uint64_p, dbl_arr)); + + // 
Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.leading_zero_buffer.SetBuffer(leading_zero_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::ChimpCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + + // Init decoding + leading_zero_block_count = state.leading_zero_buffer.BlockCount(); + leading_zero_block_size = static_cast(leading_zero_block_count) * 8; + leading_zero_index = 0; + chimp_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + leading_zero_buffer.SetBuffer(leading_zero_arr); + + // Benchmark decoding + benchmark.Run(bench_decode_chimp(dataset, + leading_zero_block_size, + leading_zero_index, + chimp_de_state, + flag_buffer, + leading_zero_buffer, + flags, + leading_zero_unpacked, + dec_arr, + flags_arr, + data_arr, + leading_zero_arr, + leading_zero_block_count, + state)); + + ifile.close(); + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("chimp") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/bench_speed/bench_chimp128.cpp b/benchmarks/bench_speed/bench_chimp128.cpp new file mode 100644 index 0000000..cd4d563 --- /dev/null +++ b/benchmarks/bench_speed/bench_chimp128.cpp @@ -0,0 +1,307 @@ +#include "bench_chimp128.hpp" +#include "chimp/chimp128.hpp" +#include "data.hpp" + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_decode_chimp128(const double* dbl_arr, + alp_bench::Column& dataset, + uint8_t leading_zero_block_count, + alp_bench::Chimp128CompressionState com_stt, + idx_t leading_zero_block_size, + uint32_t unpacked_index, + uint32_t leading_zero_index, + alp_bench::Chimp128DecompressionState chimp_de_state, + uint8_t* data_arr, + alp_bench::FlagBuffer flag_buffer, + uint8_t* flags_arr, + alp_bench::LeadingZeroBuffer leading_zero_buffer, + uint8_t* leading_zero_arr, + alp_bench::ChimpConstants::Flags* flags, + uint16_t* packed_data_arr, + uint8_t* leading_zero_unpacked, + alp_bench::UnpackedData* unpacked_data_arr, + uint64_t* dec_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_decode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + // Init decoding + leading_zero_block_count = com_stt.leading_zero_buffer.BlockCount(); + leading_zero_block_size = static_cast(leading_zero_block_count) * 8; + unpacked_index = 0; + leading_zero_index = 0; + chimp_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + leading_zero_buffer.SetBuffer(leading_zero_arr); + + /* + * + * DECODE + * + */ + + // Decode flags + flags[0] = alp_bench::ChimpConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (idx_t i = 0; i < 1023; i++) { + flags[1 + i] = (alp_bench::ChimpConstants::Flags)flag_buffer.Extract(); + } + + // Decode leading zero + for (idx_t i = 0; i < leading_zero_block_size; i++) { + leading_zero_unpacked[i] = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[leading_zero_buffer.Extract()]; + } + + /* + * count how many cases of 'TRAILING_EXCEEDS_THRESHOLD' are based on the flags + * that is the exact 
number of packed data blocks + * that is the case in which in Chimp128 they save data in a block of 16 bits + */ + idx_t packed_data_block_count = 0; + for (idx_t i = 0; i < 1024; i++) { + packed_data_block_count += flags[1 + i] == alp_bench::ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD; + } + + for (idx_t i = 0; i < packed_data_block_count; i++) { + alp_bench::PackedDataUtils::Unpack(((uint16_t*)packed_data_arr)[i], unpacked_data_arr[i]); + if (unpacked_data_arr[i].significant_bits == 0) { unpacked_data_arr[i].significant_bits = 64; } + unpacked_data_arr[i].leading_zero = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[unpacked_data_arr[i].leading_zero]; + } + + chimp_de_state.Reset(); + + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::Chimp128Decompression::Load( + flags[i], leading_zero_unpacked, leading_zero_index, unpacked_data_arr, unpacked_index, chimp_de_state); + } + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_encode_chimp128(alp_bench::Column& dataset, + alp_bench::Chimp128CompressionState com_stt, + uint8_t leading_zero_block_count, + idx_t leading_zero_block_size, + uint32_t unpacked_index, + uint32_t leading_zero_index, + alp_bench::Chimp128DecompressionState chimp_de_state, + uint8_t* data_arr, + alp_bench::FlagBuffer flag_buffer, + uint8_t* flags_arr, + alp_bench::LeadingZeroBuffer leading_zero_buffer, + uint8_t* leading_zero_arr, + alp_bench::ChimpConstants::Flags* flags, + uint16_t* packed_data_arr, + uint8_t* leading_zero_unpacked, + alp_bench::UnpackedData* unpacked_data_arr, + uint64_t* dec_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + // Init decoding + leading_zero_block_count = com_stt.leading_zero_buffer.BlockCount(); + leading_zero_block_size = static_cast(leading_zero_block_count) * 8; + unpacked_index = 0; + leading_zero_index = 0; + chimp_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + leading_zero_buffer.SetBuffer(leading_zero_arr); + + /* + * + * DECODE + * + */ + + // Decode flags + flags[0] = alp_bench::ChimpConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (idx_t i = 0; i < 1023; i++) { + flags[1 + i] = (alp_bench::ChimpConstants::Flags)flag_buffer.Extract(); + } + + // Decode leading zero + for (idx_t i = 0; i < leading_zero_block_size; i++) { + leading_zero_unpacked[i] = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[leading_zero_buffer.Extract()]; + } + + /* + * count how many cases of 'TRAILING_EXCEEDS_THRESHOLD' are based on the flags + * that is the exact number of packed data blocks + * that is the case in which in Chimp128 they save data in a block of 16 bits + */ + idx_t packed_data_block_count = 0; + for (idx_t i = 0; i < 1024; i++) { + packed_data_block_count += flags[1 + i] == alp_bench::ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD; + } + + for (idx_t i = 0; i < packed_data_block_count; i++) { + alp_bench::PackedDataUtils::Unpack(((uint16_t*)packed_data_arr)[i], unpacked_data_arr[i]); + if (unpacked_data_arr[i].significant_bits == 0) { 
unpacked_data_arr[i].significant_bits = 64; } + unpacked_data_arr[i].leading_zero = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[unpacked_data_arr[i].leading_zero]; + } + + chimp_de_state.Reset(); + + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::Chimp128Decompression::Load( + flags[i], leading_zero_unpacked, leading_zero_index, unpacked_data_arr, unpacked_index, chimp_de_state); + } + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + uint8_t* data_arr; + uint8_t* flags_arr; + uint8_t* leading_zero_arr; + uint16_t* packed_data_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + + // Encode + alp_bench::Chimp128CompressionState com_stt; + uint8_t leading_zero_block_count; + + // Decode + idx_t leading_zero_block_size; + uint32_t unpacked_index; + uint32_t leading_zero_index; + alp_bench::FlagBuffer flag_buffer; + alp_bench::LeadingZeroBuffer leading_zero_buffer; + alp_bench::Chimp128DecompressionState chimp_de_state; + alp_bench::ChimpConstants::Flags* flags; + uint8_t* leading_zero_unpacked; + alp_bench::UnpackedData* unpacked_data_arr; + + dbl_arr = new (std::align_val_t {64}) double[1024]; + data_arr = new (std::align_val_t {64}) uint8_t[8096]; + flags_arr = new (std::align_val_t {64}) uint8_t[1025]; + leading_zero_arr = new (std::align_val_t {64}) uint8_t[1024]; + dec_arr = new (std::align_val_t {64}) uint64_t[1024]; + packed_data_arr = new (std::align_val_t {64}) uint16_t[1024]; + flags = new (std::align_val_t {64}) alp_bench::ChimpConstants::Flags[1025]; + leading_zero_unpacked = new (std::align_val_t {64}) uint8_t[1024]; + unpacked_data_arr = new (std::align_val_t {64}) alp_bench::UnpackedData[1024]; + + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + if (dataset.name.find("bw") != std::string::npos) { continue; } + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + // Init Encoding + com_stt.Reset(); + com_stt.output.SetStream(data_arr); + com_stt.leading_zero_buffer.SetBuffer(leading_zero_arr); + com_stt.flag_buffer.SetBuffer(flags_arr); + com_stt.packed_data_buffer.SetBuffer(packed_data_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::Chimp128Compression::Store(uint64_p[i], com_stt); + } + com_stt.Flush(); + com_stt.output.Flush(); + + // Benchmark encoding + benchmark.Run(bench_encode_chimp128(dataset, + com_stt, + leading_zero_block_count, + leading_zero_block_size, + unpacked_index, + leading_zero_index, + chimp_de_state, + data_arr, + flag_buffer, + flags_arr, + leading_zero_buffer, + leading_zero_arr, + flags, + packed_data_arr, + leading_zero_unpacked, + unpacked_data_arr, + dec_arr)); + + // Benchmark decoding + benchmark.Run(bench_decode_chimp128(dbl_arr, + dataset, + leading_zero_block_count, + com_stt, + leading_zero_block_size, + unpacked_index, + leading_zero_index, + chimp_de_state, + data_arr, + flag_buffer, + flags_arr, + 
leading_zero_buffer, + leading_zero_arr, + flags, + packed_data_arr, + leading_zero_unpacked, + unpacked_data_arr, + dec_arr)); + + ifile.close(); + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("chimp128") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/bench_speed/bench_gorillas.cpp b/benchmarks/bench_speed/bench_gorillas.cpp new file mode 100644 index 0000000..29d2b91 --- /dev/null +++ b/benchmarks/bench_speed/bench_gorillas.cpp @@ -0,0 +1,167 @@ +#include "bench_gorillas.hpp" +#include "data.hpp" +#include "gorillas/gorillas.hpp" + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_decode_gorillas(alp_bench::Column& dataset, + alp_bench::GorillasDecompressionState gorillas_de_state, + alp_bench::FlagBuffer flag_buffer, + alp_bench::GorillasConstants::Flags* flags, + uint64_t* dec_arr, + uint8_t* flags_arr, + uint8_t* data_arr, + alp_bench::GorillasCompressionState state) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_decode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + flags[0] = alp_bench::GorillasConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (idx_t i = 0; i < 1023; i++) { + flags[1 + i] = (alp_bench::GorillasConstants::Flags)flag_buffer.Extract(); + } + + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::GorillasDecompression::Load(flags[i], gorillas_de_state); + } + gorillas_de_state.Reset(); + gorillas_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_encode_gorillas(alp_bench::Column& dataset, + alp_bench::GorillasCompressionState state, + uint8_t* data_arr, + uint8_t* flags_arr, + uint64_t* uint64_p, + double* dbl_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + // Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::GorillasCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + uint8_t* data_arr; + uint8_t* flags_arr; + double* dbl_arr; + uint64_t* dec_arr; + uint64_t* uint64_p; + alp_bench::GorillasCompressionState state; + alp_bench::GorillasConstants::Flags* flags; + alp_bench::FlagBuffer flag_buffer; + alp_bench::GorillasDecompressionState gorillas_de_state; + + dbl_arr = new (std::align_val_t {64}) double[1024]; 
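+    // Note on the allocations below (and the analogous ones in the other
+    // bench_speed files): C++17 aligned operator new is used so that every
+    // scratch array starts on a 64-byte (cache-line) boundary, which keeps
+    // the vectorized kernels from straddling lines. The buffers are never
+    // freed, which is presumably acceptable for these short-lived benchmark
+    // processes; releasing them would require the matching aligned form,
+    // e.g. ::operator delete[](dbl_arr, std::align_val_t {64}).
+    // data_arr is sized 8192 + 1024 bytes: 8 KiB holds a full vector of 1024
+    // doubles, and the extra bytes appear to be slack for the case where
+    // Gorillas expands rather than compresses the data.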
+ data_arr = new (std::align_val_t {64}) uint8_t[8192 + 1024]; + flags_arr = new (std::align_val_t {64}) uint8_t[1025]; + dec_arr = new (std::align_val_t {64}) uint64_t[1024]; + flags = new (std::align_val_t {64}) alp_bench::GorillasConstants::Flags[1024]; + + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + if (dataset.name.find("bw") != std::string::npos) { continue; } + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + // Benchmark decoding + benchmark.Run(bench_encode_gorillas(dataset, state, data_arr, flags_arr, uint64_p, dbl_arr)); + + // Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::GorillasCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + + // Init decoding + gorillas_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + + // Benchmark decoding + benchmark.Run( + bench_decode_gorillas(dataset, gorillas_de_state, flag_buffer, flags, dec_arr, flags_arr, data_arr, state)); + + ifile.close(); + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("gorillas") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + .add_extra_info(benchmark::CmakeInfo::getCmakeInfo()); + benchmark_all(benchmark); +} diff --git a/benchmarks/bench_speed/bench_patas.cpp b/benchmarks/bench_speed/bench_patas.cpp new file mode 100644 index 0000000..22229e9 --- /dev/null +++ b/benchmarks/bench_speed/bench_patas.cpp @@ -0,0 +1,154 @@ +#include "bench_patas.hpp" +#include "data.hpp" +#include "patas/patas.hpp" + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_decoding_patas(alp_bench::Column& dataset, + uint16_t* packed_metadata, + uint8_t* data_arr, + uint64_t* dec_arr, + alp_bench::ByteReader byte_reader, + alp_bench::patas::PatasUnpackedValueStats* unpacked_data) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_decode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + byte_reader.SetStream(data_arr); + + // UNPACKING METADATA (16 bits - 3 bytes) + for (idx_t i = 0; i < 1024; i++) { + alp_bench::PackedDataUtils::Unpack(((uint16_t*)packed_metadata)[i], + (alp_bench::UnpackedData&)unpacked_data[i]); + } + dec_arr[0] = (uint64_t)0; // Not sure why without this, it does not work on the > 2nd iteration... 
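+        // Likely explanation for the reset above: Patas decodes value i by
+        // XOR-ing the stored payload against dec_arr[i - index_diff], and the
+        // first value is emitted with index_diff == 0, i.e. it references
+        // dec_arr[0] itself. The compressor XOR-ed that first value against 0,
+        // so the decoder must see 0 there as well; from the second timed
+        // iteration onward dec_arr[0] still holds the value decoded by the
+        // previous iteration, which would corrupt the run unless it is reset.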
+ for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::patas::PatasDecompression::DecompressValue( + byte_reader, + unpacked_data[i].significant_bytes, + unpacked_data[i].trailing_zeros, + dec_arr[i - unpacked_data[i].index_diff]); + } + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run +bench_encoding_patas(alp_bench::Column& dataset, + alp_bench::patas::PatasCompressionState patas_state, + uint8_t* data_arr, + uint16_t* packed_metadata, + uint64_t* uint64_p, + double* dbl_arr) { + + int benchmark_number = dataset.id; + +#ifdef NDEBUG + uint64_t iterations = 300000; +#else + uint64_t iterations = 1; +#endif + + std::string benchmark_name = dataset.name + "_encode"; + + uint64_t cycles = benchmark::cycleclock::Now(); + for (uint64_t j = 0; j < iterations; ++j) { + patas_state.Reset(); + patas_state.SetOutputBuffer(data_arr); + patas_state.packed_data_buffer.SetBuffer(packed_metadata); + + /* + * Encode + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::patas::PatasCompression::Store(uint64_p[i], patas_state); + } + } + + cycles = benchmark::cycleclock::Now() - cycles; + + return benchmark::BenchmarkReporter::Run( + benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); +} + +void benchmark_all(benchmark::Benchmark& benchmark) { + double* dbl_arr; + uint8_t* data_arr; + uint64_t* dec_arr; + uint64_t* uint64_p; + uint16_t* packed_metadata; + alp_bench::patas::PatasCompressionState patas_state; + alp_bench::patas::PatasUnpackedValueStats* unpacked_data; + alp_bench::ByteReader byte_reader; + + data_arr = new (std::align_val_t {64}) uint8_t[8096]; + dec_arr = new (std::align_val_t {64}) uint64_t[1024]; + packed_metadata = new (std::align_val_t {64}) uint16_t[1024]; + unpacked_data = new (std::align_val_t {64}) alp_bench::patas::PatasUnpackedValueStats[1024]; + dbl_arr = new (std::align_val_t {64}) double[1024]; + + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + + // check to see that the file was opened correctly: + if (!ifile.is_open()) { + std::cerr << "There was a problem opening the input file!\n"; + exit(1); // exit or do additional error checking + } + + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c += 1; + } + + // Init Encoding + patas_state.Reset(); + patas_state.SetOutputBuffer(data_arr); + patas_state.packed_data_buffer.SetBuffer(packed_metadata); + + /* + * Encode + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::patas::PatasCompression::Store(uint64_p[i], patas_state); + } + + // Benchmark Encoding + benchmark.Run(bench_encoding_patas(dataset, patas_state, data_arr, packed_metadata, uint64_p, dbl_arr)); + + // Init decoding + byte_reader.SetStream(data_arr); + + // Benchmark decoding + benchmark.Run(bench_decoding_patas(dataset, packed_metadata, data_arr, dec_arr, byte_reader, unpacked_data)); + + ifile.close(); + } +} + +int main() { + benchmark::Benchmark benchmark = + benchmark::create("patas") + .save() + .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile()) + .print() + 
.add_extra_info(benchmark::CmakeInfo::getCmakeInfo());
+    benchmark_all(benchmark);
+}
diff --git a/benchmarks/bench_speed/bench_zstd.cpp b/benchmarks/bench_speed/bench_zstd.cpp
new file mode 100644
index 0000000..944a449
--- /dev/null
+++ b/benchmarks/bench_speed/bench_zstd.cpp
@@ -0,0 +1,108 @@
+#include "bench_zstd.hpp"
+#include "data.hpp"
+#include "test/mapper.hpp"
+#include "zstd.h"
+
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run
+bench_decode_zstd(alp_bench::Column& dataset, void* enc_arr, size_t enc_size, void* dec_arr) {
+
+    int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+    uint64_t iterations = 300000;
+#else
+    uint64_t iterations = 1;
+#endif
+
+    std::string benchmark_name = dataset.name + "_decode";
+    size_t      DECODED_SIZE   = 8 * 131072;
+
+    uint64_t cycles = benchmark::cycleclock::Now();
+    for (uint64_t j = 0; j < iterations; ++j) {
+        ZSTD_decompress(dec_arr, DECODED_SIZE, enc_arr, enc_size);
+    }
+
+    cycles = benchmark::cycleclock::Now() - cycles;
+
+    return benchmark::BenchmarkReporter::Run(
+        benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 131072));
+}
+
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run
+bench_encode_zstd(alp_bench::Column& dataset, double* dbl_arr, void* enc_arr) {
+
+    int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+    uint64_t iterations = 300000;
+#else
+    uint64_t iterations = 1;
+#endif
+
+    std::string benchmark_name = dataset.name + "_encode";
+
+    size_t ENC_SIZE_UPPER_BOUND = 8 * 131072;
+    size_t INPUT_SIZE           = ENC_SIZE_UPPER_BOUND;
+
+    uint64_t cycles = benchmark::cycleclock::Now();
+    for (uint64_t j = 0; j < iterations; ++j) {
+        ZSTD_compress(enc_arr, ENC_SIZE_UPPER_BOUND, dbl_arr, INPUT_SIZE, 3);
+    }
+
+    cycles = benchmark::cycleclock::Now() - cycles;
+
+    return benchmark::BenchmarkReporter::Run(
+        benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 131072));
+}
+
+void benchmark_all(benchmark::Benchmark& benchmark) {
+    double* dbl_arr;
+    void*   enc_arr;
+    void*   dec_arr;
+    size_t  enc_size;
+
+    dbl_arr = new (std::align_val_t {64}) double[131072];
+    enc_arr = (void*)new (std::align_val_t {64}) double[131072];
+    dec_arr = (void*)new (std::align_val_t {64}) double[131072];
+
+    for (auto& dataset : alp_bench::alp_dataset) {
+
+        size_t tup_c;
+
+        std::cout << dataset.binary_file_path << "\n";
+
+        const auto* col = mapper::mmap_file<double>(tup_c, dataset.binary_file_path);
+
+        double num = 0.0;
+        // copy (up to) the first 131072 values from the mmap-ed binary column:
+        size_t c {0};
+        for (size_t i = 0; i < tup_c && c < 131072; i++) {
+            num        = col[i];
+            dbl_arr[c] = num;
+            c += 1;
+        }
+
+        // Benchmark encoding
+        benchmark.Run(bench_encode_zstd(dataset, dbl_arr, enc_arr));
+
+        // Init Encoding
+        size_t ENC_SIZE_UPPER_BOUND = 8 * 131072;
+        size_t INPUT_SIZE           = ENC_SIZE_UPPER_BOUND;
+
+        // To store ENC_SIZE
+        size_t const ENC_SIZE = ZSTD_compress(enc_arr, ENC_SIZE_UPPER_BOUND, dbl_arr, INPUT_SIZE, 3);
+        printf("%6u -> %7u\n", (unsigned)INPUT_SIZE, (unsigned)ENC_SIZE);
+
+        // Benchmark decoding
+        benchmark.Run(bench_decode_zstd(dataset, enc_arr, ENC_SIZE, dec_arr));
+    }
+}
+
+int main() {
+    benchmark::Benchmark benchmark =
+        benchmark::create("zstd")
+            .save()
+            .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile())
+            .print()
+            .add_extra_info(benchmark::CmakeInfo::getCmakeInfo());
+    benchmark_all(benchmark);
+}
diff --git a/benchmarks/fls_bench/LICENSE b/benchmarks/fls_bench/LICENSE
new file mode 100644
index 0000000..d94ea38
--- /dev/null
+++
b/benchmarks/fls_bench/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Azim Afroozeh, CWI Database Architectures Group + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/benchmarks/fls_bench/fls_bench.hpp b/benchmarks/fls_bench/fls_bench.hpp new file mode 100644 index 0000000..69f8e2c --- /dev/null +++ b/benchmarks/fls_bench/fls_bench.hpp @@ -0,0 +1,2261 @@ +#ifndef FASTLANES_COMPRESSION_FLS_BENCH_FLS_BENCH_HPP +#define FASTLANES_COMPRESSION_FLS_BENCH_FLS_BENCH_HPP + +/* + * The M1 cycle counter is from Lemire repo. + * https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/tree/master/2021/03/24 The other parts are from google + * benchmark repo, edited heavily. todo -> add the link + */ +#include +#include +#include +#include +#include +#include // for errno +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for memset +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for ioctl +#include +#include // for syscall +#include +#include + +#if defined(__linux__) +#include // for __NR_perf_event_open +#include // for perf event constants +#endif +/*---------------------------------------------------------------------------------------------------------------------\ + * Macros: +\---------------------------------------------------------------------------------------------------------------------*/ +// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer. +#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) +#define BENCHMARK_HAS_CXX11 +#endif + +// This _MSC_VER check should detect VS 2017 v15.3 and newer. 
+#if __cplusplus >= 201703L || (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L) +#define BENCHMARK_HAS_CXX17 +#endif + +#if defined(BENCHMARK_HAS_CXX11) +#include +#include +#include +#endif + +#if defined(_MSC_VER) +#include // for _ReadWriteBarrier +#endif + +#ifndef BENCHMARK_HAS_CXX11 +#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + TypeName& operator=(const TypeName&) +#else +#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + TypeName& operator=(const TypeName&) = delete +#endif + +#ifdef BENCHMARK_HAS_CXX17 +#define BENCHMARK_UNUSED FLS_BENCH_MAYBE_UNUSED +#elif defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_UNUSED __attribute__((unused)) +#else +#define BENCHMARK_UNUSED +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#elif defined(_MSC_VER) && !defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __forceinline +#if _MSC_VER >= 1900 +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#else +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif +#define __func__ __FUNCTION__ +#else +#define BENCHMARK_ALWAYS_INLINE +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif + +#define BENCHMARK_INTERNAL_TOSTRING2(x) #x +#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x) + +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y) +#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) +#else +#define BENCHMARK_BUILTIN_EXPECT(x, y) x +#define BENCHMARK_DEPRECATED_MSG(msg) +#define BENCHMARK_WARNING_MSG(msg) \ + __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING(__LINE__) ") : warning note: " msg)) +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if defined(__GNUC__) || __has_builtin(__builtin_unreachable) +#define BENCHMARK_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +#define BENCHMARK_UNREACHABLE() __assume(false) +#else +#define BENCHMARK_UNREACHABLE() ((void)0) +#endif + +#ifdef BENCHMARK_HAS_CXX11 +#else +#endif + +// clang-format off + +#ifndef __has_feature + #define __has_feature(x) 0 +#endif + +#if defined(__clang__) + #if defined(__ibmxl__) + #if !defined(COMPILER_IBMXL) + #define COMPILER_IBMXL + #endif + #elif !defined(COMPILER_CLANG) + #define COMPILER_CLANG + #endif +#elif defined(_MSC_VER) + #if !defined(COMPILER_MSVC) + #define COMPILER_MSVC + #endif +#elif defined(__GNUC__) + #if !defined(COMPILER_GCC) + #define COMPILER_GCC + #endif +#endif + +#if __has_feature(cxx_attributes) + #define BENCHMARK_NORETURN [[noreturn]] +#elif defined(__GNUC__) + #define BENCHMARK_NORETURN __attribute__((noreturn)) +#elif defined(COMPILER_MSVC) + #define BENCHMARK_NORETURN __declspec(noreturn) +#else + #define BENCHMARK_NORETURN +#endif + +#if defined(__CYGWIN__) + #define BENCHMARK_OS_CYGWIN 1 +#elif defined(_WIN32) + #define BENCHMARK_OS_WINDOWS 1 + #if defined(__MINGW32__) + #define BENCHMARK_OS_MINGW 1 + #endif +#elif defined(__APPLE__) + #define BENCHMARK_OS_APPLE 1 + #include "TargetConditionals.h" + #if defined(TARGET_OS_MAC) + #define BENCHMARK_OS_MACOSX 1 + #if defined(TARGET_OS_IPHONE) + #define 
BENCHMARK_OS_IOS 1 + #endif + #endif +#elif defined(__FreeBSD__) + #define BENCHMARK_OS_FREEBSD 1 +#elif defined(__NetBSD__) + #define BENCHMARK_OS_NETBSD 1 +#elif defined(__OpenBSD__) + #define BENCHMARK_OS_OPENBSD 1 +#elif defined(__DragonFly__) + #define BENCHMARK_OS_DRAGONFLY 1 +#elif defined(__linux__) + #define BENCHMARK_OS_LINUX 1 +#elif defined(__native_client__) + #define BENCHMARK_OS_NACL 1 +#elif defined(__EMSCRIPTEN__) + #define BENCHMARK_OS_EMSCRIPTEN 1 +#elif defined(__rtems__) + #define BENCHMARK_OS_RTEMS 1 +#elif defined(__Fuchsia__) + #define BENCHMARK_OS_FUCHSIA 1 +#elif defined (__SVR4) && defined (__sun) + #define BENCHMARK_OS_SOLARIS 1 +#elif defined(__QNX__) + #define BENCHMARK_OS_QNX 1 +#elif defined(__MVS__) + #define BENCHMARK_OS_ZOS 1 +#endif + +#if defined(__ANDROID__) && defined(__GLIBCXX__) + #define BENCHMARK_STL_ANDROID_GNUSTL 1 +#endif + +#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \ + && !defined(__EXCEPTIONS) + #define BENCHMARK_HAS_NO_EXCEPTIONS +#endif + +#if defined(COMPILER_CLANG) || defined(COMPILER_GCC) + #define FLS_BENCH_MAYBE_UNUSED __attribute__((unused)) +#else + #define BENCHMARK_MAYBE_UNUSED +#endif + +// clang-format on + +#ifdef BENCHMARK_OS_WINDOWS +#include +#undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA +#include +#include +#include +#else +#include +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD +#include +#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \ + defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY +#define BENCHMARK_HAS_SYSCTL +#include +#endif +#endif +#if defined(BENCHMARK_OS_SOLARIS) +#include +#endif +#if defined(BENCHMARK_OS_QNX) +#include +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#elif defined(_MSC_VER) && !defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __forceinline +#if _MSC_VER >= 1900 +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#else +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif +#define __func__ __FUNCTION__ +#else +#define BENCHMARK_ALWAYS_INLINE +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif + +#if defined(BENCHMARK_OS_MACOSX) +#include +#endif +// For MSVC, we want to use '_asm rdtsc' when possible (since it works +// with even ancient MSVC compilers), and when not possible the +// __rdtsc intrinsic, declared in . Unfortunately, in some +// environments, and have conflicting +// declarations of some other intrinsics, breaking compilation. +// Therefore, we simply declare __rdtsc ourselves. 
See also +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) +extern "C" uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif + +#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW) +#include +#include +#endif + +#ifdef BENCHMARK_OS_EMSCRIPTEN +#include +#endif + +#ifdef __aarch64__ +#define KPERF_LIST \ + /* ret, name, params */ \ + F(int, kpc_get_counting, void) \ + F(int, kpc_force_all_ctrs_set, int) \ + F(int, kpc_set_counting, uint32_t) \ + F(int, kpc_set_thread_counting, uint32_t) \ + F(int, kpc_set_config, uint32_t, void*) \ + F(int, kpc_get_config, uint32_t, void*) \ + F(int, kpc_set_period, uint32_t, void*) \ + F(int, kpc_get_period, uint32_t, void*) \ + F(uint32_t, kpc_get_counter_count, uint32_t) \ + F(uint32_t, kpc_get_config_count, uint32_t) \ + F(int, kperf_sample_get, int*) \ + F(int, kpc_get_thread_counters, int, unsigned int, void*) + +#define F(ret, name, ...) \ + typedef ret name##proc(__VA_ARGS__); \ + static name##proc* name; +KPERF_LIST +#undef F + +#define CFGWORD_EL0A32EN_MASK (0x10000) +#define CFGWORD_EL0A64EN_MASK (0x20000) +#define CFGWORD_EL1EN_MASK (0x40000) +#define CFGWORD_EL3EN_MASK (0x80000) +#define CFGWORD_ALLMODES_MASK (0xf0000) + +#define CPMU_NONE 0 +#define CPMU_CORE_CYCLE 0x02 +#define CPMU_INST_A64 0x8c +#define CPMU_INST_BRANCH 0x8d +#define CPMU_SYNC_DC_LOAD_MISS 0xbf +#define CPMU_SYNC_DC_STORE_MISS 0xc0 +#define CPMU_SYNC_DTLB_MISS 0xc1 +#define CPMU_SYNC_ST_HIT_YNGR_LD 0xc4 +#define CPMU_SYNC_BR_ANY_MISP 0xcb +#define CPMU_FED_IC_MISS_DEM 0xd3 +#define CPMU_FED_ITLB_MISS 0xd4 + +#define KPC_CLASS_FIXED (0) +#define KPC_CLASS_CONFIGURABLE (1) +#define KPC_CLASS_POWER (2) +#define KPC_CLASS_RAWPMU (3) +#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) +#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) +#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) +#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) + +#define COUNTERS_COUNT 10 +#define CONFIG_COUNT 8 +#define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK) + +#endif + +#ifdef BENCHMARK_OS_WINDOWS +#include +#endif + +#ifdef BENCHMARK_OS_ZOS +#include +#endif + +#include +#ifdef BENCHMARK_STL_ANDROID_GNUSTL +#include +#endif + +#ifdef BENCHMARK_OS_WINDOWS +#include +#undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA +#include +#include +#include +#else +#include +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD +#include +#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \ + defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY +#define BENCHMARK_HAS_SYSCTL +#include +#endif +#endif +#if defined(BENCHMARK_OS_SOLARIS) +#include +#endif +#if defined(BENCHMARK_OS_QNX) +#include +#endif + +#define SOURCE_DIR "${CMAKE_SOURCE_DIR}" +#define CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}" +#define CMAKE_HOST_SYSTEM_PROCESSOR "${CMAKE_HOST_SYSTEM_PROCESSOR}" +#define CMAKE_SYSTEM_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}" +#define CMAKE_HOST_SYSTEM_NAME "${CMAKE_HOST_SYSTEM_NAME}" +#define CMAKE_SYSTEM_NAME "${CMAKE_SYSTEM_NAME}" +#define CMAKE_C_COMPILER "${CMAKE_C_COMPILER}" +#define CMAKE_CXX_COMPILER "${CMAKE_CXX_COMPILER}" +#define CMAKE_CXX_COMPILER_ID "${CMAKE_CXX_COMPILER_ID}" +#define CMAKE_CXX_COMPILER_VERSION "${CMAKE_CXX_COMPILER_VERSION}" +#define 
CMAKE_CROSSCOMPILING "${CMAKE_CROSSCOMPILING}" +#define CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" +#define CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" +#define CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" +#define CMAKE_TOOLCHAIN_FILE "${CMAKE_TOOLCHAIN_FILE}" + +#define TARGET_NAME "${TARGET_NAME}" +#define TARGET_COMPILE_OPTIONS "${TARGET_COMPILE_OPTIONS}" + +/*---------------------------------------------------------------------------------------------------------------------\ + * Lib: +\---------------------------------------------------------------------------------------------------------------------*/ +namespace benchmark { +/* From: https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h + * Now api has been added to be compatible with the rest of fls_bench. + */ +#if defined(__linuxm__) +namespace perf { +template +class LinuxEvents { + + int fd; + perf_event_attr attribs; + bool running; + +public: + LinuxEvents(int config) + : fd(0) { + memset(&attribs, 0, sizeof(attribs)); + attribs.type = TYPE; + attribs.size = sizeof(attribs); + attribs.config = config; + attribs.disabled = 1; + attribs.exclude_kernel = 1; + attribs.exclude_hv = 1; + + const int pid = 0; // the current process + const int cpu = -1; // all CPUs + const int group = -1; // no group + const unsigned long flags = 0; + fd = syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags); + if (fd == -1) { report_error("perf_event_open"); } + + running = false; + } + + ~LinuxEvents() { close(fd); } + + void start() { + if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_RESET)"); } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); } + } + + unsigned long end() { + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); } + + unsigned long result; + if (read(fd, &result, sizeof(result)) == -1) { report_error("read"); } + + return result; + } + + unsigned long now() { + if (!running) { + if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_RESET)"); } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); } + + running = true; + return 0; + } else { + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) { report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); } + + unsigned long result; + if (read(fd, &result, sizeof(result)) == -1) { report_error("read"); } + + running = false; + return result; + } + } + +private: + void report_error(const std::string& context) { + throw std::runtime_error(context + ": " + std::string(strerror(errno))); + } +}; + +} // namespace perf + +perf::LinuxEvents cycles(PERF_COUNT_HW_CPU_CYCLES); +#endif + +// NOTE: only i386 and x86_64 have been well tested. +// PPC, sparc, alpha, and ia64 are based on +// http://peter.kuscsik.com/wordpress/?p=14 +// with modifications by m3b. 
See also +// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h +namespace cycleclock { + +#if defined(__aarch64__) +#if defined(__APPLE__) +static uint64_t g_counters[COUNTERS_COUNT]; +static uint64_t g_config[COUNTERS_COUNT]; +#endif +#endif + +FLS_BENCH_MAYBE_UNUSED static uint64_t get_counters() { +#if defined(__aarch64__) +#if defined(__APPLE__) + static bool WARNED = false; + if (kpc_get_thread_counters(0, COUNTERS_COUNT, g_counters)) { + if (!WARNED) { + printf("kpc_get_thread_counters failed, run as sudo?\n"); + WARNED = true; + } + return 1; + } + // g_counters[3 + 2] gives you the number of instructions 'decoded' + // whereas g_counters[1] might give you the number of instructions 'retired'. + return g_counters[0 + 2]; +#endif +#endif + return 0; +} + +FLS_BENCH_MAYBE_UNUSED static void configure_rdtsc() { +#if defined(__aarch64__) +#if defined(__APPLE__) + if (kpc_set_config(KPC_MASK, g_config)) { + printf("kpc_set_config failed\n"); + return; + } + + if (kpc_force_all_ctrs_set(1)) { + printf("kpc_force_all_ctrs_set failed\n"); + return; + } + + if (kpc_set_counting(KPC_MASK)) { + printf("kpc_set_counting failed\n"); + return; + } + + if (kpc_set_thread_counting(KPC_MASK)) { + printf("kpc_set_thread_counting failed\n"); + return; + } +#endif +#endif +} + +static void Init() { +#if defined(__aarch64__) +#if defined(__APPLE__) + void* kperf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf", RTLD_LAZY); + if (!kperf) { + printf("kperf = %p\n", kperf); + return; + } +#define F(ret, name, ...) \ + name = (name##proc*)(dlsym(kperf, #name)); \ + if (!name) { \ + printf("%s = %p\n", #name, (void*)name); \ + return; \ + } + KPERF_LIST +#undef F + + if (kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) { + printf("wrong fixed counters count\n"); + return; + } + + if (kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) { + printf("wrong fixed config count\n"); + return; + } + g_config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK; + g_config[3] = CPMU_INST_BRANCH | CFGWORD_EL0A64EN_MASK; + g_config[4] = CPMU_SYNC_BR_ANY_MISP | CFGWORD_EL0A64EN_MASK; + g_config[5] = CPMU_INST_A64 | CFGWORD_EL0A64EN_MASK; + + configure_rdtsc(); +#endif +#endif +} +static uint64_t get_counters(); +// This should return the number of cycles since power-on. Thread-safe. +inline BENCHMARK_ALWAYS_INLINE int64_t Now() { + // #if defined(BENCHMARK_OS_MACOSX) + // // this goes at the top because we need ALL Macs, regardless of + // // architecture, to return the number of "mach time units" that + // // have passed since startup. See sysinfo.cc where + // // InitializeSystemInfo() sets the supposed cpu clock frequency of + // // macs to the number of mach time units per second, not actual + // // CPU clock frequency (which can change in the face of CPU + // // frequency scaling). Also note that when the Mac sleeps, this + // // counter pauses; it does not continue counting, nor does it + // // reset to zero. + // return mach_absolute_time(); + // #el +#if defined(BENCHMARK_OS_EMSCRIPTEN) + // this goes above x86-specific code because old versions of Emscripten + // define __x86_64__, although they have nothing to do with it. 
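+	// (emscripten_get_now() reports wall-clock milliseconds as a double, so the
+	// commented-out variant below would scale it by 1e+6 into nanosecond-style
+	// ticks, e.g. 12.5 ms -> 12500000; the portable std::chrono fallback is
+	// used here instead.)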
+	// return static_cast<int64_t>(emscripten_get_now() * 1e+6);
+
+	return std::chrono::high_resolution_clock::now().time_since_epoch().count();
+#elif defined(__i386__)
+	int64_t ret;
+	__asm__ volatile("rdtsc" : "=A"(ret));
+	return ret;
+#elif defined(__x86_64__) || defined(__amd64__)
+	uint64_t low, high;
+	__asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
+	return (high << 32) | low;
+#elif defined(__powerpc__) || defined(__ppc__)
+	// This returns a time-base, which is not always precisely a cycle-count.
+#if defined(__powerpc64__) || defined(__ppc64__)
+	int64_t tb;
+	asm volatile("mfspr %0, 268" : "=r"(tb));
+	return tb;
+#else
+	uint32_t tbl, tbu0, tbu1;
+	asm volatile("mftbu %0\n"
+	             "mftb %1\n"
+	             "mftbu %2"
+	             : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
+	tbl &= -static_cast<int32_t>(tbu0 == tbu1);
+	// high 32 bits in tbu1; low 32 bits in tbl (tbu0 is no longer needed)
+	return (static_cast<uint64_t>(tbu1) << 32) | tbl;
+#endif
+#elif defined(__sparc__)
+	int64_t tick;
+	asm(".byte 0x83, 0x41, 0x00, 0x00");
+	asm("mov %%g1, %0" : "=r"(tick));
+	return tick;
+#elif defined(__ia64__)
+	int64_t itc;
+	asm("mov %0 = ar.itc" : "=r"(itc));
+	return itc;
+#elif defined(COMPILER_MSVC) && defined(_M_IX86)
+	// Older MSVC compilers (like 7.x) don't seem to support the
+	// __rdtsc intrinsic properly, so I prefer to use _asm instead
+	// when I know it will work. Otherwise, I'll use __rdtsc and hope
+	// the code is being compiled with a non-ancient compiler.
+	_asm rdtsc
+#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
+	// See
+	// https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019
+	// and https://reviews.llvm.org/D53115
+	int64_t virtual_timer_value;
+	virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
+	return virtual_timer_value;
+#elif defined(COMPILER_MSVC)
+	return __rdtsc();
+#elif defined(BENCHMARK_OS_NACL)
+	// Native Client validator on x86/x86-64 allows RDTSC instructions,
+	// and this case is handled above. Native Client validator on ARM
+	// rejects MRC instructions (used in the ARM-specific sequence below),
+	// so we handle it here. Portable Native Client compiles to
+	// architecture-agnostic bytecode, which doesn't provide any
+	// cycle counter access mnemonics.
+
+	// Native Client does not provide any API to access cycle counter.
+	// Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+	// because it provides nanosecond resolution (which is noticeable at
+	// least for PNaCl modules running on x86 Mac & Linux).
+	// Initialize to always return 0 if clock_gettime fails.
+	struct timespec ts = {0, 0};
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif defined(__aarch64__)
+	// System timer of ARMv8 runs at a different frequency than the CPU's.
+	// The frequency is fixed, typically in the range 1-50MHz. It can be
+	// read at CNTFRQ special register. We assume the OS has set up
+	// the virtual timer properly.
+	// int64_t virtual_timer_value;
+	// asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
+	// return virtual_timer_value;
+#if defined(__APPLE__)
+	return get_counters();
+#else
+	return cycles.now();
+#endif
+
+#elif defined(__ARM_ARCH)
+	// V6 is the earliest arch that has a standard cyclecount
+	// Native Client validator doesn't allow MRC instructions.
+#if (__ARM_ARCH >= 6)
+	uint32_t pmccntr;
+	uint32_t pmuseren;
+	uint32_t pmcntenset;
+	// Read the user mode perf monitor counter access permissions.
+ asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); + if (pmuseren & 1) // Allows reading perfmon counters for user mode code. + { + asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); + if (pmcntenset & 0x80000000ul) // Is it counting? + { + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); + // The counter is set up to count every 64th cycle + return static_cast(pmccntr) * 64; // Should optimize to << 6 + } + } +#endif + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__mips__) || defined(__m68k__) + // mips apparently only allows rdtsc for superusers, so we fall + // back to gettimeofday. It's possible clock_gettime would be better. + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__s390__) // Covers both s390 and s390x. + // Return the CPU clock. + uint64_t tsc; +#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL) + // z/OS XL compiler HLASM syntax. + asm(" stck %0" : "=m"(tsc) : : "cc"); +#else + asm("stck %0" : "=Q"(tsc) : : "cc"); +#endif + return tsc; +#elif defined(__riscv) // RISC-V + // Use RDCYCLE (and RDCYCLEH on riscv32) +#if __riscv_xlen == 32 + uint32_t cycles_lo, cycles_hi0, cycles_hi1; + // This asm also includes the PowerPC overflow handling strategy, as above. + // Implemented in assembly because Clang insisted on branching. + asm volatile("rdcycleh %0\n" + "rdcycle %1\n" + "rdcycleh %2\n" + "sub %0, %0, %2\n" + "seqz %0, %0\n" + "sub %0, zero, %0\n" + "and %1, %1, %0\n" + : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1)); + return (static_cast(cycles_hi1) << 32) | cycles_lo; +#else + uint64_t cycles; + asm volatile("rdcycle %0" : "=r"(cycles)); + return cycles; +#endif +#elif defined(__e2k__) || defined(__elbrus__) + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#else + // The soft failover to a generic implementation is automatic only for ARM. + // For other platforms the developer is expected to make an attempt to create + // a fast implementation and use generic version if nothing better is available. +#error You need to define CycleTimer for your OS and CPU + // return + // std::chrono::high_resolution_clock::now().time_since_epoch().count(); + +#endif +} +} // end namespace cycleclock + +namespace timer { +inline uint64_t Now() { return std::chrono::high_resolution_clock::now().time_since_epoch().count(); } +} // namespace timer + +const int kNumMillisPerSecond = 1000; +const int kNumMicrosPerMilli = 1000; +const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000; +const int kNumNanosPerMicro = 1000; +const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond; + +#ifdef BENCHMARK_OS_WINDOWS +// Window's Sleep takes milliseconds argument. +void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); } +void SleepForSeconds(double seconds) { SleepForMilliseconds(static_cast(kNumMillisPerSecond * seconds)); } +#else // BENCHMARK_OS_WINDOWS +static void SleepForMicroseconds(int microseconds) { +#ifdef BENCHMARK_OS_ZOS + // z/OS does not support nanosleep. Instead call sleep() and then usleep() to + // sleep for the remaining microseconds because usleep() will fail if its + // argument is greater than 1000000. 
+ div_t sleepTime = div(microseconds, kNumMicrosPerSecond); + int seconds = sleepTime.quot; + while (seconds != 0) { + seconds = sleep(seconds); + } + while (usleep(sleepTime.rem) == -1 && errno == EINTR) + ; +#else + struct timespec sleep_time; + sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; + sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; + while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) + ; // Ignore signals and wait for the full interval to elapse. +#endif +} + +static void SleepForMilliseconds(int milliseconds) { SleepForMicroseconds(milliseconds * kNumMicrosPerMilli); } + +FLS_BENCH_MAYBE_UNUSED static void SleepForSeconds(double seconds) { + SleepForMicroseconds(static_cast(seconds * kNumMicrosPerSecond)); +} +#endif // BENCHMARK_OS_WINDOWS + +namespace internal { +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. +#ifndef COMPILER_MSVC +template +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array))) + +} // namespace internal + +// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. +const char kBigSIUnits[] = "kMGTPEZY"; +// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. +const char kBigIECUnits[] = "KMGTPEZY"; +// milli, micro, nano, pico, femto, atto, zepto, yocto. +const char kSmallSIUnits[] = "munpfazy"; + +// We require that all three arrays have the same size. +static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), "SI and IEC unit arrays must be the same size"); +static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), + "Small SI and Big SI unit arrays must be the same size"); + +static const int64_t kUnitsSize = arraysize(kBigSIUnits); + +static void ToExponentAndMantissa( + double val, double thresh, int precision, double one_k, std::string* mantissa, int64_t* exponent) { + std::stringstream mantissa_stream; + + if (val < 0) { + mantissa_stream << "-"; + val = -val; + } + + // Adjust threshold so that it never excludes things which can't be rendered + // in 'precision' digits. 
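+	// e.g. with precision = 1, 1.0 / 10^1 = 0.1: a caller-supplied threshold
+	// below 0.1 is raised to 0.1, because smaller values cannot be shown with
+	// a single digit of precision.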
+	const double adjusted_threshold = std::max(thresh, 1.0 / std::pow(10.0, precision));
+	const double big_threshold = adjusted_threshold * one_k;
+	const double small_threshold = adjusted_threshold;
+	// Values in ]simple_threshold,small_threshold[ will be printed as-is
+	const double simple_threshold = 0.01;
+
+	if (val > big_threshold) {
+		// Positive powers
+		double scaled = val;
+		for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) {
+			scaled /= one_k;
+			if (scaled <= big_threshold) {
+				mantissa_stream << scaled;
+				*exponent = i + 1;
+				*mantissa = mantissa_stream.str();
+				return;
+			}
+		}
+		mantissa_stream << val;
+		*exponent = 0;
+	} else if (val < small_threshold) {
+		// Negative powers
+		if (val < simple_threshold) {
+			double scaled = val;
+			for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
+				scaled *= one_k;
+				if (scaled >= small_threshold) {
+					mantissa_stream << scaled;
+					*exponent = -static_cast<int64_t>(i + 1);
+					*mantissa = mantissa_stream.str();
+					return;
+				}
+			}
+		}
+		mantissa_stream << val;
+		*exponent = 0;
+	} else {
+		mantissa_stream << val;
+		*exponent = 0;
+	}
+	*mantissa = mantissa_stream.str();
+}
+
+static std::string ExponentToPrefix(int64_t exponent, bool iec) {
+	if (exponent == 0) { return ""; }
+
+	const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
+	if (index >= kUnitsSize) { return ""; }
+
+	const char* array = (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
+	if (iec) {
+		return array[index] + std::string("i");
+	} else {
+		return std::string(1, array[index]);
+	}
+}
+
+static std::string ToBinaryStringFullySpecified(double value, double threshold, int precision, double one_k = 1024.0) {
+	std::string mantissa;
+	int64_t exponent;
+	ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa, &exponent);
+	return mantissa + ExponentToPrefix(exponent, false);
+}
+
+FLS_BENCH_MAYBE_UNUSED static void AppendHumanReadable(int n, std::string* str) {
+	std::stringstream ss;
+	// Round down to the nearest SI prefix.
+	ss << ToBinaryStringFullySpecified(n, 1.0, 0);
+	*str += ss.str();
+}
+
+FLS_BENCH_MAYBE_UNUSED static std::string HumanReadableNumber(double n, double one_k = 1024.0) {
+	// 1.1 means that figures up to 1.1k should be shown with the next unit down;
+	// this softens edge effects.
+	// 1 means that we should show one decimal place of precision.
+	return ToBinaryStringFullySpecified(n, 1.1, 1, one_k);
+}
+
+static std::string StrFormatImp(const char* msg, va_list args) {
+	// we might need a second shot at this, so pre-emptively make a copy
+	va_list args_cp;
+	va_copy(args_cp, args);
+
+	// TODO(ericwf): use std::array for first attempt to avoid one memory
+	// allocation guess what the size might be
+	std::array<char, 256> local_buff;
+	std::size_t size = local_buff.size();
+	// 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
+	// in the android-ndk
+	auto ret = vsnprintf(local_buff.data(), size, msg, args_cp);
+
+	va_end(args_cp);
+
+	// handle empty expansion
+	if (ret == 0) return std::string {};
+	if (static_cast<std::size_t>(ret) < size) { return std::string(local_buff.data()); }
+
+	// we did not provide a long enough buffer on our first attempt.
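+	// (vsnprintf returns the number of characters that would have been written,
+	// excluding the terminating null byte, which is why the retry below sizes
+	// the buffer as ret + 1.)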
+	// add 1 to size to account for null-byte in size cast to prevent overflow
+	size = static_cast<std::size_t>(ret) + 1;
+	auto buff_ptr = std::unique_ptr<char[]>(new char[size]);
+	// 2015-10-08: vsnprintf is used instead of std::vsnprintf due to a limitation
+	// in the android-ndk
+	ret = vsnprintf(buff_ptr.get(), size, msg, args);
+	return std::string(buff_ptr.get());
+}
+
+#if defined(__MINGW32__)
+__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2)))
+#elif defined(__GNUC__)
+__attribute__((format(printf, 1, 2)))
+#endif
+static std::string
+StrFormat(const char* format, ...) {
+	va_list args;
+	va_start(args, format);
+	std::string tmp = StrFormatImp(format, args);
+	va_end(args);
+	return tmp;
+}
+
+inline std::ostream& StrCatImp(std::ostream& out) { return out; }
+
+template <class First, class... Rest>
+inline std::ostream& StrCatImp(std::ostream& out, First&& f, Rest&&... rest) {
+	out << std::forward<First>(f);
+	return StrCatImp(out, std::forward<Rest>(rest)...);
+}
+
+template <class... Args>
+inline std::string StrCat(Args&&... args) {
+	std::ostringstream ss;
+	StrCatImp(ss, std::forward<Args>(args)...);
+	return ss.str();
+}
+
+std::vector<std::string> StrSplit(const std::string& str, char delim);
+
+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
+/*
+ * GNU STL in Android NDK lacks support for some C++11 functions, including
+ * stoul, stoi, stod. We reimplement them here using C functions strtoul,
+ * strtol, strtod. Note that reimplemented functions are in benchmark::
+ * namespace, not std:: namespace.
+ */
+unsigned long stoul(const std::string& str, size_t* pos = nullptr, int base = 10);
+int stoi(const std::string& str, size_t* pos = nullptr, int base = 10);
+double stod(const std::string& str, size_t* pos = nullptr);
+#else
+using std::stod;
+using std::stoi;
+using std::stoul;
+#endif
+
+class CPUInfo {
+public:
+	struct CacheInfo {
+		std::string type;
+		int level;
+		int size;
+		int num_sharing;
+	};
+
+	enum Scaling { UNKNOWN, ENABLED, DISABLED };
+
+	static inline std::string ToString(Scaling v) {
+		switch (v) {
+		case UNKNOWN:
+			return "UNKNOWN";
+		case ENABLED:
+			return "ENABLED";
+		case DISABLED:
+			return "DISABLED";
+		default:
+			return "UNKNOWN";
+		}
+	}
+
+	// Getters
+	static const CPUInfo& getInstance() {
+		static const CPUInfo info;
+		return info;
+	}
+	int getNumCpus() const { return num_cpus; };
+	double getCyclesPerSecond() const { return cycles_per_second; };
+	const std::vector<CacheInfo>& getCaches() const { return caches; };
+	const std::vector<double>& getLoadAvg() const { return load_avg; };
+	std::string getScaling() const { return ToString(scaling); };
+
+	int num_cpus;
+	Scaling scaling;
+	double cycles_per_second;
+	std::vector<CacheInfo> caches;
+	std::vector<double> load_avg;
+
+private:
+	// private constructor
+	CPUInfo();
+
+	BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CPUInfo);
+};
+
+static void PrintImp(std::ostream& out) { out << std::endl; }
+
+template <class First, class... Rest>
+void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
+	out << std::forward<First>(f);
+	PrintImp(out, std::forward<Rest>(rest)...);
+}
+
+template <class... ARGS>
+BENCHMARK_NORETURN void PrintErrorAndDie(ARGS&&... args) {
+	PrintImp(std::cerr, std::forward<ARGS>(args)...);
+	std::exit(EXIT_FAILURE);
+}
+
+#ifdef BENCHMARK_HAS_SYSCTL
+
+/// ValueUnion - A type used to correctly alias the byte-for-byte output of
+/// `sysctl` with the result type it's to be interpreted as.
+struct ValueUnion {
+	union DataT {
+		uint32_t uint32_value;
+		uint64_t uint64_value;
+		// For correct aliasing of union members from bytes.
+		char bytes[8];
+	};
+	using DataPtr = std::unique_ptr<DataT, decltype(&std::free)>;
+
+	// The size of the data union member + its trailing array size.
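+	// e.g. ValueUnion(8) gives Size = sizeof(DataT) + 8 = 16 bytes, so sysctl
+	// can write an 8-byte value with room to spare past the union itself.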
+	size_t Size;
+	DataPtr Buff;
+
+public:
+	ValueUnion()
+	    : Size(0)
+	    , Buff(nullptr, &std::free) {}
+
+	explicit ValueUnion(size_t buff_size)
+	    : Size(sizeof(DataT) + buff_size)
+	    , Buff(::new (std::malloc(Size)) DataT(), &std::free) {}
+
+	ValueUnion(ValueUnion&& other) = default;
+	explicit operator bool() const { return bool(Buff); }
+	char* data() const { return Buff->bytes; }
+	std::string GetAsString() const { return {data()}; }
+	int64_t GetAsInteger() const {
+		if (Size == sizeof(Buff->uint32_value)) {
+			return static_cast<int32_t>(Buff->uint32_value);
+		} else if (Size == sizeof(Buff->uint64_value)) {
+			return static_cast<int64_t>(Buff->uint64_value);
+		}
+		BENCHMARK_UNREACHABLE();
+	}
+	uint64_t GetAsUnsigned() const {
+		if (Size == sizeof(Buff->uint32_value)) {
+			return Buff->uint32_value;
+		} else if (Size == sizeof(Buff->uint64_value)) {
+			return Buff->uint64_value;
+		}
+		BENCHMARK_UNREACHABLE();
+	}
+	template <class T, int N>
+	std::array<T, N> GetAsArray() {
+		const int ArrSize = sizeof(T) * N;
+		// CHECK_LE(ArrSize, Size);
+		std::array<T, N> arr;
+		std::memcpy(arr.data(), data(), ArrSize);
+		return arr;
+	}
+};
+
+static ValueUnion GetSysctlImp(std::string const& name) {
+#if defined BENCHMARK_OS_OPENBSD
+	int mib[2];
+
+	mib[0] = CTL_HW;
+	if ((name == "hw.ncpu") || (name == "hw.cpuspeed")) {
+		ValueUnion buff(sizeof(int));
+
+		if (name == "hw.ncpu") {
+			mib[1] = HW_NCPU;
+		} else {
+			mib[1] = HW_CPUSPEED;
+		}
+
+		if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) { return ValueUnion(); }
+		return buff;
+	}
+	return ValueUnion();
+#else
+	size_t cur_buff_size = 0;
+	if (sysctlbyname(name.c_str(), nullptr, &cur_buff_size, nullptr, 0) == -1) { return {}; }
+
+	ValueUnion buff(cur_buff_size);
+	if (sysctlbyname(name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0) { return buff; }
+	return {};
+#endif
+}
+
+FLS_BENCH_MAYBE_UNUSED static bool GetSysctl(std::string const& name, std::string* out) {
+	out->clear();
+	auto buff = GetSysctlImp(name);
+	if (!buff) { return false; }
+	out->assign(buff.data());
+	return true;
+}
+
+template <class TP, class = typename std::enable_if<std::is_integral<TP>::value>::type>
+bool GetSysctl(std::string const& name, TP* Out) {
+	*Out = 0;
+	auto buff = GetSysctlImp(name);
+	if (!buff) { return false; }
+	*Out = static_cast<TP>(buff.GetAsUnsigned());
+	return true;
+}
+
+template <class TP, int N>
+bool GetSysctl(std::string const& name, std::array<TP, N>* Out) {
+	auto buff = GetSysctlImp(name);
+	if (!buff) { return false; }
+	*Out = buff.GetAsArray<TP, N>();
+	return true;
+}
+#endif
+
+template <class ARG_T>
+bool ReadFromFile(std::string const& fname, ARG_T* arg) {
+	*arg = ARG_T();
+	std::ifstream f(fname.c_str());
+	if (!f.is_open()) { return false; }
+	f >> *arg;
+	return f.good();
+}
+
+static CPUInfo::Scaling CpuScaling(int num_cpus) {
+	// We don't have a valid CPU count, so don't even bother.
+	if (num_cpus <= 0) { return CPUInfo::Scaling::UNKNOWN; }
+#ifdef BENCHMARK_OS_QNX
+	return CPUInfo::Scaling::UNKNOWN;
+#endif
+#ifndef BENCHMARK_OS_WINDOWS
+	// On Linux, the CPUfreq subsystem exposes CPU information as files on the
+	// local file system. If reading the exported files fails, then we may not be
+	// running on Linux, so we silently ignore all the read errors.
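+	// e.g. reading "powersave" (or anything other than "performance") from
+	// cpu0's scaling_governor below makes this report Scaling::ENABLED.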
+ std::string res; + for (int cpu = 0; cpu < num_cpus; ++cpu) { + std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor"); + if (ReadFromFile(governor_file, &res) && res != "performance") { return CPUInfo::Scaling::ENABLED; } + } + return CPUInfo::Scaling::DISABLED; +#endif + return CPUInfo::Scaling::UNKNOWN; +} + +static int CountSetBitsInCPUMap(std::string val) { + auto count_bits = [](std::string Part) { + using CPUMask = std::bitset; + Part = "0x" + Part; + CPUMask mask(benchmark::stoul(Part, nullptr, 16)); + return static_cast(mask.count()); + }; + size_t pos; + int total = 0; + while ((pos = val.find(',')) != std::string::npos) { + total += count_bits(val.substr(0, pos)); + val = val.substr(pos + 1); + } + if (!val.empty()) { total += count_bits(val); } + return total; +} + +FLS_BENCH_MAYBE_UNUSED +static std::vector GetCacheSizesFromKVFS() { + std::vector res; + std::string dir = "/sys/devices/system/cpu/cpu0/cache/"; + int idx = 0; + while (true) { + CPUInfo::CacheInfo info; + std::string f_path = StrCat(dir, "index", idx++, "/"); + std::ifstream f(StrCat(f_path, "size").c_str()); + if (!f.is_open()) { break; } + std::string suffix; + f >> info.size; + if (f.fail()) { PrintErrorAndDie("Failed while reading file '", f_path, "size'"); } + if (f.good()) { + f >> suffix; + if (f.bad()) { + PrintErrorAndDie("Invalid cache size format: failed to read size suffix"); + } else if (f && suffix != "K") { + PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix); + } else if (suffix == "K") { + info.size *= 1024; + } + } + if (!ReadFromFile(StrCat(f_path, "type"), &info.type)) { + PrintErrorAndDie("Failed to read from file ", f_path, "type"); + } + if (!ReadFromFile(StrCat(f_path, "level"), &info.level)) { + PrintErrorAndDie("Failed to read from file ", f_path, "level"); + } + std::string map_str; + if (!ReadFromFile(StrCat(f_path, "shared_cpu_map"), &map_str)) { + PrintErrorAndDie("Failed to read from file ", f_path, "shared_cpu_map"); + } + info.num_sharing = CountSetBitsInCPUMap(map_str); + res.push_back(info); + } + + return res; +} + +#ifdef BENCHMARK_OS_MACOSX +std::vector GetCacheSizesMacOSX() { + std::vector res; + std::array cache_counts {{0, 0, 0, 0}}; + GetSysctl("hw.cacheconfig", &cache_counts); + + struct { + std::string name; + std::string type; + int level; + uint64_t num_sharing; + } Cases[] = {{"hw.l1dcachesize", "Data", 1, cache_counts[1]}, + {"hw.l1icachesize", "Instruction", 1, cache_counts[1]}, + {"hw.l2cachesize", "Unified", 2, cache_counts[2]}, + {"hw.l3cachesize", "Unified", 3, cache_counts[3]}}; + for (auto& c : Cases) { + int val; + if (!GetSysctl(c.name, &val)) { continue; } + CPUInfo::CacheInfo info; + info.type = c.type; + info.level = c.level; + info.size = val; + info.num_sharing = static_cast(c.num_sharing); + res.push_back(std::move(info)); + } + return res; +} +#elif defined(BENCHMARK_OS_WINDOWS) +std::vector GetCacheSizesWindows() { + std::vector res; + DWORD buffer_size = 0; + using PInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION; + using CInfo = CACHE_DESCRIPTOR; + + using UPtr = std::unique_ptr; + GetLogicalProcessorInformation(nullptr, &buffer_size); + UPtr buff((PInfo*)malloc(buffer_size), &std::free); + if (!GetLogicalProcessorInformation(buff.get(), &buffer_size)) + PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ", GetLastError()); + + PInfo* it = buff.get(); + PInfo* end = buff.get() + (buffer_size / sizeof(PInfo)); + + for (; it != end; ++it) { + if (it->Relationship != 
RelationCache) { continue; } + using BitSet = std::bitset; + BitSet B(it->ProcessorMask); + // To prevent duplicates, only consider caches where CPU 0 is specified + if (!B.test(0)) { continue; } + CInfo* Cache = &it->Cache; + CPUInfo::CacheInfo C; + C.num_sharing = static_cast(B.count()); + C.level = Cache->Level; + C.size = Cache->Size; + switch (Cache->Type) { + case CacheUnified: + C.type = "Unified"; + break; + case CacheInstruction: + C.type = "Instruction"; + break; + case CacheData: + C.type = "Data"; + break; + case CacheTrace: + C.type = "Trace"; + break; + default: + C.type = "Unknown"; + break; + } + res.push_back(C); + } + return res; +} +#elif BENCHMARK_OS_QNX +std::vector GetCacheSizesQNX() { + std::vector res; + struct cacheattr_entry* cache = SYSPAGE_ENTRY(cacheattr); + uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr); + int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize; + for (int i = 0; i < num; ++i) { + CPUInfo::CacheInfo info; + switch (cache->flags) { + case CACHE_FLAG_INSTR: + info.type = "Instruction"; + info.level = 1; + break; + case CACHE_FLAG_DATA: + info.type = "Data"; + info.level = 1; + break; + case CACHE_FLAG_UNIFIED: + info.type = "Unified"; + info.level = 2; + break; + case CACHE_FLAG_SHARED: + info.type = "Shared"; + info.level = 3; + break; + default: + continue; + break; + } + info.size = cache->line_size * cache->num_lines; + info.num_sharing = 0; + res.push_back(std::move(info)); + cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize); + } + return res; +} +#endif + +static std::vector GetCacheSizes() { +#ifdef BENCHMARK_OS_MACOSX + return GetCacheSizesMacOSX(); +#elif defined(BENCHMARK_OS_WINDOWS) + return GetCacheSizesWindows(); +#elif defined(BENCHMARK_OS_QNX) + return GetCacheSizesQNX(); +#else + return GetCacheSizesFromKVFS(); +#endif +} + +FLS_BENCH_MAYBE_UNUSED std::string GetSystemName() { +#if defined(BENCHMARK_OS_WINDOWS) + std::string str; + const unsigned COUNT = MAX_COMPUTERNAME_LENGTH + 1; + TCHAR hostname[COUNT] = {'\0'}; + DWORD DWCOUNT = COUNT; + if (!GetComputerName(hostname, &DWCOUNT)) { return std::string(""); } +#ifndef UNICODE + str = std::string(hostname, DWCOUNT); +#else + // Using wstring_convert, Is deprecated in C++17 + using convert_type = std::codecvt_utf8; + std::wstring_convert converter; + std::wstring wStr(hostname, DWCOUNT); + str = converter.to_bytes(wStr); +#endif + return str; +#else // defined(BENCHMARK_OS_WINDOWS) +#ifndef HOST_NAME_MAX +#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_NACL) +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_QNX) +#define HOST_NAME_MAX 154 +#elif defined(BENCHMARK_OS_RTEMS) +#define HOST_NAME_MAX 256 +#else +#warning "HOST_NAME_MAX not defined. using 64" +#define HOST_NAME_MAX 64 +#endif +#endif // def HOST_NAME_MAX + char hostname[HOST_NAME_MAX]; + int retVal = gethostname(hostname, HOST_NAME_MAX); + if (retVal != 0) { return std::string(""); } + return std::string(hostname); +#endif // Catch-all POSIX block. +} + +static int GetNumCPUs() { +#ifdef BENCHMARK_HAS_SYSCTL + int NumCPU = -1; + if (GetSysctl("hw.ncpu", &NumCPU)) { return NumCPU; } + fprintf(stderr, "Err: %s\n", strerror(errno)); + std::exit(EXIT_FAILURE); +#elif defined(BENCHMARK_OS_WINDOWS) + SYSTEM_INFO sysinfo; + // Use memset as opposed to = {} to avoid GCC missing initializer false + // positives. 
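+	// (A sketch of the warning being avoided, assuming an older GCC:
+	// "SYSTEM_INFO sysinfo = {};" can trip -Wmissing-field-initializers there,
+	// while memset-zeroing does not.)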
+	std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
+	GetSystemInfo(&sysinfo);
+	return sysinfo.dwNumberOfProcessors; // number of logical
+	                                     // processors in the current
+	                                     // group
+#elif defined(BENCHMARK_OS_SOLARIS)
+	// Returns -1 in case of a failure.
+	int NumCPU = sysconf(_SC_NPROCESSORS_ONLN);
+	if (NumCPU < 0) { fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n", strerror(errno)); }
+	return NumCPU;
+#elif defined(BENCHMARK_OS_QNX)
+	return static_cast<int>(_syspage_ptr->num_cpu);
+#else
+	int NumCPUs = 0;
+	int MaxID = -1;
+	std::ifstream f("/proc/cpuinfo");
+	if (!f.is_open()) {
+		std::cerr << "failed to open /proc/cpuinfo\n";
+		return -1;
+	}
+	const std::string Key = "processor";
+	std::string ln;
+	while (std::getline(f, ln)) {
+		if (ln.empty()) { continue; }
+		size_t SplitIdx = ln.find(':');
+		std::string value;
+#if defined(__s390__)
+		// s390 has another format in /proc/cpuinfo
+		// it needs to be parsed differently
+		if (SplitIdx != std::string::npos) { value = ln.substr(Key.size() + 1, SplitIdx - Key.size() - 1); }
+#else
+		if (SplitIdx != std::string::npos) { value = ln.substr(SplitIdx + 1); }
+#endif
+		if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
+			NumCPUs++;
+			if (!value.empty()) {
+				int CurID = benchmark::stoi(value);
+				MaxID = std::max(CurID, MaxID);
+			}
+		}
+	}
+	if (f.bad()) {
+		std::cerr << "Failure reading /proc/cpuinfo\n";
+		return -1;
+	}
+	if (!f.eof()) {
+		std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+		return -1;
+	}
+	f.close();
+
+	if ((MaxID + 1) != NumCPUs) {
+		fprintf(stderr,
+		        "CPU ID assignments in /proc/cpuinfo seem messed up."
+		        " This is usually caused by a bad BIOS.\n");
+	}
+	return NumCPUs;
+#endif
+	BENCHMARK_UNREACHABLE();
+}
+
+static double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
+	// Currently, scaling is only used on linux path here,
+	// suppress diagnostics about it being unused on other paths.
+	(void)scaling;
+
+#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
+	long freq;
+
+	// If the kernel is exporting the tsc frequency use that. There are issues
+	// where cpuinfo_max_freq cannot be relied on because the BIOS may be
+	// exporting an invalid p-state (on x86) or p-states may be used to put the
+	// processor in a new mode (turbo mode). Essentially, those frequencies
+	// cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
+	// well.
+	if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
+	    // If CPU scaling is disabled, use the *current* frequency.
+	    // Note that we specifically don't want to read cpuinfo_cur_freq,
+	    // because it is only readable by root.
+	    || (scaling == CPUInfo::Scaling::DISABLED &&
+	        ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", &freq))
+	    // Otherwise, if CPU scaling may be in effect, we want to use
+	    // the *maximum* frequency, not whatever CPU speed some random processor
+	    // happens to be using now.
+	    || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", &freq)) {
+		// The value is in kHz (as the file name suggests). For example, on a
+		// 2GHz warpstation, the file contains the value "2000000".
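+		// i.e. a reading of 2000000 kHz becomes 2000000 * 1000.0 = 2.0e9 cycles
+		// per second in the return below.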
+ return freq * 1000.0; + } + + const double error_value = -1; + double bogo_clock = error_value; + + std::ifstream f("/proc/cpuinfo"); + if (!f.is_open()) { + std::cerr << "failed to open /proc/cpuinfo\n"; + return error_value; + } + + auto startsWithKey = [](std::string const& Value, std::string const& Key) { + if (Key.size() > Value.size()) { return false; } + auto Cmp = [&](char X, char Y) { + return std::tolower(X) == std::tolower(Y); + }; + return std::equal(Key.begin(), Key.end(), Value.begin(), Cmp); + }; + + std::string ln; + while (std::getline(f, ln)) { + if (ln.empty()) { continue; } + size_t SplitIdx = ln.find(':'); + std::string value; + if (SplitIdx != std::string::npos) { value = ln.substr(SplitIdx + 1); } + // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only + // accept positive values. Some environments (virtual machines) report zero, + // which would cause infinite looping in WallTime_Init. + if (startsWithKey(ln, "cpu MHz")) { + if (!value.empty()) { + double cycles_per_second = benchmark::stod(value) * 1000000.0; + if (cycles_per_second > 0) { return cycles_per_second; } + } + } else if (startsWithKey(ln, "bogomips")) { + if (!value.empty()) { + bogo_clock = benchmark::stod(value) * 1000000.0; + if (bogo_clock < 0.0) { bogo_clock = error_value; } + } + } + } + if (f.bad()) { + std::cerr << "Failure reading /proc/cpuinfo\n"; + return error_value; + } + if (!f.eof()) { + std::cerr << "Failed to read to end of /proc/cpuinfo\n"; + return error_value; + } + f.close(); + // If we found the bogomips clock, but nothing better, we'll use it (but + // we're not happy about it); otherwise, fallback to the rough estimation + // below. + if (bogo_clock >= 0.0) { return bogo_clock; } + +#elif defined BENCHMARK_HAS_SYSCTL + constexpr auto* FreqStr = +#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD) + "machdep.tsc_freq"; +#elif defined BENCHMARK_OS_OPENBSD + "hw.cpuspeed"; +#elif defined BENCHMARK_OS_DRAGONFLY + "hw.tsc_frequency"; +#else + "hw.cpufrequency"; +#endif + unsigned long long hz = 0; +#if defined BENCHMARK_OS_OPENBSD + if (GetSysctl(FreqStr, &hz)) { return hz * 1000000; } +#else + if (GetSysctl(FreqStr, &hz)) { return hz; } +#endif + fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", FreqStr, strerror(errno)); + +#elif defined BENCHMARK_OS_WINDOWS + // In NT, read MHz from the registry. If we fail to do so or we're in win9x + // then make a crude estimate. 
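+	// e.g. a "~MHz" registry value of 2000 is returned below as
+	// 2000 * 1000 * 1000 = 2.0e9 Hz.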
+ DWORD data, data_size = sizeof(data); + if (IsWindowsXPOrGreater() && SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "~MHz", + nullptr, + &data, + &data_size))) + return static_cast((int64_t)data * (int64_t)(1000 * 1000)); // was mhz +#elif defined(BENCHMARK_OS_SOLARIS) + kstat_ctl_t* kc = kstat_open(); + if (!kc) { + std::cerr << "failed to open /dev/kstat\n"; + return -1; + } + kstat_t* ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0"); + if (!ksp) { + std::cerr << "failed to lookup in /dev/kstat\n"; + return -1; + } + if (kstat_read(kc, ksp, NULL) < 0) { + std::cerr << "failed to read from /dev/kstat\n"; + return -1; + } + kstat_named_t* knp = (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz"); + if (!knp) { + std::cerr << "failed to lookup data in /dev/kstat\n"; + return -1; + } + if (knp->data_type != KSTAT_DATA_UINT64) { + std::cerr << "current_clock_Hz is of unexpected data type: " << knp->data_type << "\n"; + return -1; + } + double clock_hz = knp->value.ui64; + kstat_close(kc); + return clock_hz; +#elif defined(BENCHMARK_OS_QNX) + return static_cast((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) * (int64_t)(1000 * 1000)); +#endif + // If we've fallen through, attempt to roughly estimate the CPU clock rate. + const int estimate_time_ms = 1000; + cycleclock::Init(); + const auto start_ticks = cycleclock::Now(); + SleepForMilliseconds(estimate_time_ms); + return static_cast(cycleclock::Now() - start_ticks); +} + +static std::vector GetLoadAvg() { +#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || defined BENCHMARK_OS_MACOSX || \ + defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \ + !defined(__ANDROID__) + constexpr int kMaxSamples = 3; + std::vector res(kMaxSamples, 0.0); + const int nelem = getloadavg(res.data(), kMaxSamples); + if (nelem < 1) { + res.clear(); + } else { + res.resize(nelem); + } + return res; +#else + return {}; +#endif +} + +// private constructor +CPUInfo::CPUInfo() + : num_cpus(GetNumCPUs()) + , scaling(CpuScaling(num_cpus)) + , cycles_per_second(GetCPUCyclesPerSecond(scaling)) + , caches(GetCacheSizes()) + , load_avg(GetLoadAvg()) {} + +struct SystemInfo { + + static std::string GetSystemName() { +#if defined(BENCHMARK_OS_WINDOWS) + std::string str; + const unsigned COUNT = MAX_COMPUTERNAME_LENGTH + 1; + TCHAR hostname[COUNT] = {'\0'}; + DWORD DWCOUNT = COUNT; + if (!GetComputerName(hostname, &DWCOUNT)) { return std::string(""); } +#ifndef UNICODE + str = std::string(hostname, DWCOUNT); +#else + // Using wstring_convert, Is deprecated in C++17 + using convert_type = std::codecvt_utf8; + std::wstring_convert converter; + std::wstring wStr(hostname, DWCOUNT); + str = converter.to_bytes(wStr); +#endif + return str; +#else // defined(BENCHMARK_OS_WINDOWS) +#ifndef HOST_NAME_MAX +#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_NACL) +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_QNX) +#define HOST_NAME_MAX 154 +#elif defined(BENCHMARK_OS_RTEMS) +#define HOST_NAME_MAX 256 +#else +#warning "HOST_NAME_MAX not defined. using 64" +#define HOST_NAME_MAX 64 +#endif +#endif // def HOST_NAME_MAX + char hostname[HOST_NAME_MAX]; + int ret_val = gethostname(hostname, HOST_NAME_MAX); + if (ret_val != 0) { return std::string(""); } + return {hostname}; +#endif // Catch-all POSIX block. 
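+		// Usage sketch (illustrative only): callers are expected to go through
+		// the singleton rather than this helper, e.g.
+		//   const std::string& host = benchmark::SystemInfo::getInstance().getName();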
+ } + + static const SystemInfo& getInstance() { + static const SystemInfo INFO; + return INFO; + } + +private: + SystemInfo() + : m_name(GetSystemName()) {} + +public: + const std::string& getName() const { return m_name; } + +private: + std::string m_name; + + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SystemInfo); +}; + +static std::string LocalDateTimeString() { + // Write the local time in RFC3339 format yyyy-mm-ddTHH:MM:SS+/-HH:MM. + using Clock = std::chrono::system_clock; + std::time_t now = Clock::to_time_t(Clock::now()); + const std::size_t kTzOffsetLen = 6; + const std::size_t kTimestampLen = 19; + + std::size_t tz_len; + std::size_t timestamp_len; + long int offset_minutes; + char tz_offset_sign = '+'; + // tz_offset is set in one of three ways: + // * strftime with %z - This either returns empty or the ISO 8601 time. The + // maximum length an + // ISO 8601 string can be is 7 (e.g. -03:30, plus trailing zero). + // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to + // 19 for %02li, + // one for :, up to 19 %02li, plus trailing zero). + // * A fixed string of "-00:00". The maximum length is 7 (-00:00, plus + // trailing zero). + // + // Thus, the maximum size this needs to be is 41. + char tz_offset[41]; + // Long enough buffer to avoid format-overflow warnings + char storage[128]; + +#if defined(BENCHMARK_OS_WINDOWS) + std::tm* timeinfo_p = ::localtime(&now); +#else + std::tm timeinfo; + std::tm* timeinfo_p = &timeinfo; + ::localtime_r(&now, &timeinfo); +#endif + + tz_len = std::strftime(tz_offset, sizeof(tz_offset), "%z", timeinfo_p); + + if (tz_len < kTzOffsetLen && tz_len > 1) { + // Timezone offset was written. strftime writes offset as +HHMM or -HHMM, + // RFC3339 specifies an offset as +HH:MM or -HH:MM. To convert, we parse + // the offset as an integer, then reprint it to a string. + + offset_minutes = ::strtol(tz_offset, NULL, 10); + if (offset_minutes < 0) { + offset_minutes *= -1; + tz_offset_sign = '-'; + } + + tz_len = ::snprintf( + tz_offset, sizeof(tz_offset), "%c%02li:%02li", tz_offset_sign, offset_minutes / 100, offset_minutes % 100); + ((void)tz_len); // Prevent unused variable warning in optimized build. + } else { + // Unknown offset. RFC3339 specifies that unknown local offsets should be + // written as UTC time with -00:00 timezone. +#if defined(BENCHMARK_OS_WINDOWS) + // Potential race condition if another thread calls localtime or gmtime. + timeinfo_p = ::gmtime(&now); +#else + ::gmtime_r(&now, &timeinfo); +#endif + + strncpy(tz_offset, "-00:00", kTzOffsetLen + 1); + } + + timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S", timeinfo_p); + // Prevent unused variable warning in optimized build. + ((void)kTimestampLen); + + std::strncat(storage, tz_offset, sizeof(storage) - timestamp_len - 1); + return {storage}; +} + +class CPUInfo; +struct SystemInfo; +class BenchmarkReporter { +public: + struct Context { + CPUInfo const& cpu_info; + SystemInfo const& sys_info; + // The number of chars in the longest benchmark name. 
+		size_t name_field_width;
+
+		Context()
+		    : cpu_info(CPUInfo::getInstance())
+		    , sys_info(SystemInfo::getInstance()) {}
+	};
+
+	class Run {
+	public:
+		static const int64_t no_repetition_index = -1;
+		enum RunType { RT_Iteration, RT_Aggregate };
+
+		explicit Run(int benchmark_number, std::string& name, uint64_t iterations, double cycles_per_tuple)
+		    : benchmark_number(benchmark_number)
+		    , name(name)
+		    , iterations(iterations)
+		    , run_type(RT_Iteration)
+		    , error_occurred(false)
+		    , cycles_per_tuple(cycles_per_tuple)
+		    , has_memory_result(false)
+		    , allocs_per_iter(0.0)
+		    , max_bytes_used(0)
+		// time_unit(kNanosecond),
+		// real_accumulated_time(0),
+		// cpu_accumulated_time(0),
+		// max_heapbytes_used(0),
+		// complexity(oNone),
+		// complexity_lambda(),
+		// complexity_n(0),
+		// report_big_o(false),
+		// report_rms(false),
+		// counters(),
+		{}
+
+		std::string benchmark_name() const;
+		int benchmark_number;
+		std::string name;
+		uint64_t iterations;
+		RunType run_type;
+		std::string aggregate_name;
+		bool error_occurred;
+		std::string error_message;
+
+		// TimeUnit time_unit;
+		double cycles_per_tuple;
+
+		// Memory metrics.
+		bool has_memory_result;
+		double allocs_per_iter;
+		int64_t max_bytes_used;
+	};
+
+	// Construct a BenchmarkReporter with the output stream set to 'std::cout'
+	// and the error stream set to 'std::cerr'
+	BenchmarkReporter()
+	    : output_stream_(&std::cout)
+	    , error_stream_(&std::cerr) {}
+
+	// Called once for every suite of benchmarks run.
+	// The parameter "context" contains information that the
+	// reporter may wish to use when generating its report, for example the
+	// platform under which the benchmarks are running. The benchmark run is
+	// never started if this function returns false, allowing the reporter
+	// to skip runs based on the context information.
+	virtual bool ReportContext(const Context& context) = 0;
+
+	// Called once for each group of benchmark runs, gives information about
+	// cpu-time and heap memory usage during the benchmark run. If the group
+	// of runs contained more than two entries then 'report' contains additional
+	// elements representing the mean and standard deviation of those runs.
+	// Additionally if this group of runs was the last in a family of benchmarks
+	// 'reports' contains additional entries representing the asymptotic
+	// complexity and RMS of that benchmark family.
+	virtual void ReportRuns(std::vector<Run>& report) = 0;
+
+	// Called once and only once after every group of benchmarks is run and
+	// reported.
+	virtual void Finalize() {}
+
+	// REQUIRES: The object referenced by 'out' is valid for the lifetime
+	// of the reporter.
+	void SetOutputStream(std::ostream* out) {
+		assert(out);
+		output_stream_ = out;
+	}
+
+	// REQUIRES: The object referenced by 'err' is valid for the lifetime
+	// of the reporter.
+	void SetErrorStream(std::ostream* err) {
+		assert(err);
+		error_stream_ = err;
+	}
+
+	static std::ostream& GetOutputStream() { return std::cout; }
+
+	static std::ostream& GetErrorStream() { return std::cerr; }
+
+	virtual ~BenchmarkReporter() = default;
+
+	// Write a human readable string to 'out' representing the specified
+	// 'context'.
+	// REQUIRES: 'out' is non-null.
+	static void PrintBasicContext(std::ostream& out, Context const& context) {
+		// CHECK(out) << "cannot be null";
+		auto& Out = out;
+
+		Out << LocalDateTimeString() << "\n";
+
+		const CPUInfo& info = context.cpu_info;
+		Out << "Run on (" << info.num_cpus << " X " << (info.cycles_per_second / 1000000.0) << " MHz CPU "
+		    << ((info.num_cpus > 1) ? "s" : "") << ")\n";
"s" : "") << ")\n"; + if (info.caches.size() != 0) { + Out << "CPU Caches:\n"; + for (auto& CInfo : info.caches) { + Out << " L" << CInfo.level << " " << CInfo.type << " " << (CInfo.size / 1024) << " KiB"; + if (CInfo.num_sharing != 0) { Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")"; } + Out << "\n"; + } + } + if (!info.load_avg.empty()) { + Out << "Load Average: "; + for (auto It = info.load_avg.begin(); It != info.load_avg.end();) { + Out << StrFormat("%.2f", *It++); + if (It != info.load_avg.end()) { Out << ", "; } + } + Out << "\n"; + } + + if (CPUInfo::Scaling::ENABLED == info.scaling) { + Out << "***WARNING*** CPU scaling is enabled, the benchmark " + "real time measurements may be noisy and will incur extra " + "overhead.\n"; + } + +#ifndef NDEBUG + Out << "***WARNING*** Library was built as DEBUG. Timings may be " + "affected.\n"; +#endif + } + +private: + std::ostream* output_stream_; + std::ostream* error_stream_; +}; + +// https://github.com/emscripten-core/emscripten/wiki/Emterpreter or use node.js +// https://stackoverflow.com/questions/32573289/text-written-to-stdout-doesnt-appear-until-program-completion +static void printRun(std::ostream& out, benchmark::BenchmarkReporter::Run& run) { + out << run.benchmark_number << ","; + out << run.name << ","; + out << run.iterations << ","; + out << run.cycles_per_tuple; + out << '\n'; +} + +FLS_BENCH_MAYBE_UNUSED static std::string CsvEscape(const std::string& s) { + std::string tmp; + tmp.reserve(s.size() + 2); + for (char c : s) { + switch (c) { + case '"': + tmp += "\"\""; + break; + default: + tmp += c; + break; + } + } + return '"' + tmp + '"'; +} + +static void printHeader(std::ostream& out) { + out << "benchmark_number,"; + out << "name,"; + out << "iterations,"; + out << "cycles_per_tuple"; + out << "\n"; +} + +class CSVReporter : public BenchmarkReporter { +public: + explicit CSVReporter(std::string path) + : path(std::move(path)) + , printed_header(false) {} + bool ReportContext(const Context& context) override { + PrintBasicContext(GetErrorStream(), context); + return true; + } + static void PrintContext() { PrintBasicContext(GetErrorStream(), benchmark::BenchmarkReporter::Context()); } + static void WriteRuns(std::vector& reports, const std::string& path) { +#ifdef BENCHMARK_OS_EMSCRIPTEN + // alternatives : + // https://stackoverflow.com/questions/67174663/cannot-save-the-file-to-specific-directory-by-wasm + std::cerr << "Modern web browsers do not allow web pages to write/open a local file in your machine."; +#else + std::fstream file; + file.open(path, std::fstream::out); + printHeader(file); + + // print results for each run + for (auto& run : reports) { + printRun(file, run); + } + + if (file.fail()) { + std::cerr << "Error: " << strerror(errno) << ": " << path << "\n"; + throw std::exception(); + } + + std::cout << "benchmark result has been writen at " << path << '\n'; +#endif + } + static void WriteContext(std::string& cmake_info, const std::string& path) { +#ifdef BENCHMARK_OS_EMSCRIPTEN + // alternatives : + // https://stackoverflow.com/questions/67174663/cannot-save-the-file-to-specific-directory-by-wasm + std::cerr << "Modern web browsers do not allow web pages to write/open a local file in your machine."; +#else + std::fstream file; + file.open(path, std::fstream::out); + PrintBasicContext(file, benchmark::BenchmarkReporter::Context()); + file << cmake_info; + if (file.fail()) { + std::cerr << "Error: " << strerror(errno) << ": " << path << "\n"; + throw std::exception(); + } + + std::cout << 
"benchmark metadata has been writen at " << path << '\n'; +#endif + } + static void PrintRun(Run& run) { + std::ostream& out = GetOutputStream(); + printRun(out, run); + } + +public: + const std::string path; + bool printed_header; + std::set user_counter_names; +}; + +using report = benchmark::BenchmarkReporter::Run; + +class Benchmark { +private: + explicit Benchmark(std::string name) + : m_name(std::move(name)) + , m_cpu_info(CPUInfo::getInstance()) + , m_system_info(SystemInfo::getInstance()) + , m_enable_print(false) + , m_result_file {' '} + , m_metadata_file {' '} {} // +public: + friend class BenchmarkBuilder; + void Run(benchmark::BenchmarkReporter::Run run) { + if (m_enable_print) { benchmark::CSVReporter::PrintRun(run); } + + m_runs.push_back(run); + } + ~Benchmark() { + if (m_enable_save && !m_runs.empty()) { + CSVReporter::WriteRuns(m_runs, m_result_file); + CSVReporter::WriteContext(m_extra_info, m_metadata_file); + } + } // +private: + std::string m_name; + const CPUInfo& m_cpu_info; + const SystemInfo& m_system_info; + std::string m_extra_info; + bool m_enable_save; + bool m_enable_print; + std::string m_result_file; + std::string m_metadata_file; + std::string m_dir; + std::vector m_runs; +}; + +class CmakeInfo; +constexpr auto METADATA_PREFIX {".metadata"}; +constexpr auto CSV_PREFIX {".csv"}; + +class BenchmarkBuilder { +public: + explicit BenchmarkBuilder(std::string name) + : m_benchmark(std::move(name)) {} // +public: + operator Benchmark() const { return m_benchmark; } + benchmark::BenchmarkBuilder& save() { + m_benchmark.m_metadata_file = "./" + m_benchmark.m_metadata_file + METADATA_PREFIX; + m_benchmark.m_result_file = "./" + m_benchmark.m_metadata_file + CSV_PREFIX; + m_benchmark.m_enable_print = true; + return *this; + } + benchmark::BenchmarkBuilder& print() { + m_benchmark.m_enable_save = true; + return *this; + } + benchmark::BenchmarkBuilder& at(const std::string& dir) { + m_benchmark.m_metadata_file = dir + "/" + m_benchmark.m_name + METADATA_PREFIX; + m_benchmark.m_result_file = dir + "/" + m_benchmark.m_name + CSV_PREFIX; + return *this; + } + benchmark::BenchmarkBuilder& add_extra_info(const std::string& info) { + m_benchmark.m_extra_info += info; + return *this; + } + +private: + Benchmark m_benchmark; // +}; + +FLS_BENCH_MAYBE_UNUSED static BenchmarkBuilder create(const std::string& name) { + cycleclock::Init(); + return BenchmarkBuilder(name); +} + +class CmakeInfo { +public: + static const CmakeInfo& getInstance() { + static const CmakeInfo INFO; + return INFO; + } + +public: + const std::string source_dir = SOURCE_DIR; + const std::string cmake_osx_architectures = CMAKE_OSX_ARCHITECTURES; + const std::string cmake_host_system_processor = CMAKE_HOST_SYSTEM_PROCESSOR; + const std::string cmake_system_processor = CMAKE_SYSTEM_PROCESSOR; + const std::string cmake_host_system_name = CMAKE_HOST_SYSTEM_NAME; + const std::string cmake_system_name = CMAKE_SYSTEM_NAME; + const std::string cmake_c_compiler = CMAKE_C_COMPILER; + const std::string cmake_cxx_compiler = CMAKE_CXX_COMPILER; + const std::string cmake_cxx_compiler_id = CMAKE_CXX_COMPILER_ID; + const std::string cmake_cxx_compiler_version = CMAKE_CXX_COMPILER_VERSION; + const std::string cmake_crosscompiling = CMAKE_CROSSCOMPILING; + const std::string cmake_cxx_flags_debug = CMAKE_CXX_FLAGS_DEBUG; + const std::string cmake_cxx_flags_release = CMAKE_CXX_FLAGS_RELEASE; + const std::string cmake_build_type = CMAKE_BUILD_TYPE; + const std::string cmake_toolchain_file = CMAKE_TOOLCHAIN_FILE; + const 
+class CmakeInfo {
+public:
+    static const CmakeInfo& getInstance() {
+        static const CmakeInfo INFO;
+        return INFO;
+    }
+
+public:
+    const std::string source_dir = SOURCE_DIR;
+    const std::string cmake_osx_architectures = CMAKE_OSX_ARCHITECTURES;
+    const std::string cmake_host_system_processor = CMAKE_HOST_SYSTEM_PROCESSOR;
+    const std::string cmake_system_processor = CMAKE_SYSTEM_PROCESSOR;
+    const std::string cmake_host_system_name = CMAKE_HOST_SYSTEM_NAME;
+    const std::string cmake_system_name = CMAKE_SYSTEM_NAME;
+    const std::string cmake_c_compiler = CMAKE_C_COMPILER;
+    const std::string cmake_cxx_compiler = CMAKE_CXX_COMPILER;
+    const std::string cmake_cxx_compiler_id = CMAKE_CXX_COMPILER_ID;
+    const std::string cmake_cxx_compiler_version = CMAKE_CXX_COMPILER_VERSION;
+    const std::string cmake_crosscompiling = CMAKE_CROSSCOMPILING;
+    const std::string cmake_cxx_flags_debug = CMAKE_CXX_FLAGS_DEBUG;
+    const std::string cmake_cxx_flags_release = CMAKE_CXX_FLAGS_RELEASE;
+    const std::string cmake_build_type = CMAKE_BUILD_TYPE;
+    const std::string cmake_toolchain_file = CMAKE_TOOLCHAIN_FILE;
+    const std::string target_name = TARGET_NAME;
+    const std::string target_compile_options = TARGET_COMPILE_OPTIONS;
+
+public:
+    const std::string& getSourceDir() const { return source_dir; }
+    const std::string& getCmakeOsxArchitectures() const { return cmake_osx_architectures; }
+    const std::string& getCmakeHostSystemProcessor() const { return cmake_host_system_processor; }
+    const std::string& getCmakeSystemProcessor() const { return cmake_system_processor; }
+    const std::string& getCmakeHostSystemName() const { return cmake_host_system_name; }
+    const std::string& getCmakeSystemName() const { return cmake_system_name; }
+    const std::string& getCmakeCCompiler() const { return cmake_c_compiler; }
+    const std::string& getCmakeCxxCompiler() const { return cmake_cxx_compiler; }
+    const std::string& getCmakeCxxCompilerId() const { return cmake_cxx_compiler_id; }
+    const std::string& getCmakeCxxCompilerVersion() const { return cmake_cxx_compiler_version; }
+    const std::string& getCmakeCrosscompiling() const { return cmake_crosscompiling; }
+    const std::string& getCmakeCxxFlagsDebug() const { return cmake_cxx_flags_debug; }
+    const std::string& getCmakeCxxFlagsRelease() const { return cmake_cxx_flags_release; }
+    const std::string& getCmakeBuildType() const { return cmake_build_type; }
+    // snake_case on purpose: avoids colliding with the static getCmakeToolchainFile() below
+    const std::string& get_cmakeToolchainFile() const { return cmake_toolchain_file; }
+    const std::string& getTargetName() const { return target_name; }
+    const std::string& getTargetCompileOptions() const { return target_compile_options; }
+
+    static void PrintCmake() { printCmakeInfo(std::cout); }
+    static void AppendCmake(const std::string& path) {
+        std::fstream file;
+        file.open(path, std::fstream::app);
+        printCmakeInfo(file);
+
+        if (file.fail()) {
+            std::cerr << "Error: " << strerror(errno) << "\n";
+            throw std::exception();
+        }
+
+        std::cout << "cmake info has been appended to " + path;
+    }
+    static void printCmakeInfo(std::ostream& out) {
+        const CmakeInfo& info = getInstance();
+        out << info.getCmakeInfo();
+    }
+
+    // https://stackoverflow.com/a/46931770/5165633
+    static std::vector<std::string> split(const std::string& s, char delim) {
+        std::vector<std::string> result;
+        std::stringstream ss(s);
+        std::string item;
+
+        while (getline(ss, item, delim)) {
+            result.push_back(item);
+        }
+
+        return result;
+    }
+
+    static std::string getCmakeToolchainFile() {
+        const CmakeInfo& info = getInstance();
+        std::vector<std::string> v = split(info.get_cmakeToolchainFile(), '/');
+
+        auto tool_chain_file_str = v[v.size() - 1];
+        // keep only the file name and drop the trailing ".cmake" (6 characters)
+        return tool_chain_file_str.substr(0, tool_chain_file_str.size() - 6);
+    }
+
+    static std::string getCmakeInfo() {
+        std::ostringstream out;
+        const CmakeInfo& info = getInstance();
+        out << "cmake info: \n";
+        out << " source_dir: " << info.getSourceDir() << '\n';
+        out << " cmake_osx_architectures: " << info.getCmakeOsxArchitectures() << '\n';
+        out << " cmake_host_system_processor: " << info.getCmakeHostSystemProcessor() << '\n';
+        out << " cmake_system_processor: " << info.getCmakeSystemProcessor() << '\n';
+        out << " cmake_host_system_name: " << info.getCmakeHostSystemName() << '\n';
+        out << " cmake_system_name: " << info.getCmakeSystemName() << '\n';
+        out << " cmake_c_compiler: " << info.getCmakeCCompiler() << '\n';
+        out << " cmake_cxx_compiler: " << info.getCmakeCxxCompiler() << '\n';
+        out << " cmake_cxx_compiler_id: " << info.getCmakeCxxCompilerId() << '\n';
+        out << " cmake_cxx_compiler_version: " << info.getCmakeCxxCompilerVersion() << '\n';
+        out << " cmake_crosscompiling: " << info.getCmakeCrosscompiling() << '\n';
+        out << " 
cmake_cxx_flags_debug: " << info.getCmakeCxxFlagsDebug() << '\n'; + out << " cmake_cxx_flags_release: " << info.getCmakeCxxFlagsRelease() << '\n'; + out << " cmake_build_type: " << info.getCmakeBuildType() << '\n'; + out << " cmake_toolchain_file: " << info.getCmakeToolchainFile() << '\n'; + out << "target info: \n"; + out << " target_name: " << info.getTargetName() << '\n'; + out << " target_compile_options: " << info.getTargetCompileOptions() << '\n'; + return out.str(); + } + +private: + CmakeInfo() = default; // + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CmakeInfo); // +}; +} // namespace benchmark +#endif diff --git a/benchmarks/fls_bench/google/benchmark/LICENSE b/benchmarks/fls_bench/google/benchmark/LICENSE new file mode 100644 index 0000000..7a4a3ea --- /dev/null +++ b/benchmarks/fls_bench/google/benchmark/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/benchmarks/include/alp_result.hpp b/benchmarks/include/alp_result.hpp
new file mode 100644
index 0000000..f30436e
--- /dev/null
+++ b/benchmarks/include/alp_result.hpp
@@ -0,0 +1,49 @@
+/*
+-- DATE : 17/04/2024
+-- FILE_PATH : benchmarks/bench_full_dataset/result.hpp
+-- PROJECT_NAME : ALP
+*/
+
+#ifndef BENCHMARKS_RESULT_HPP
+#define BENCHMARKS_RESULT_HPP
+
+#include <cstdint>
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace alp_bench {
+
+struct VectorMetadata {
+    uint8_t bit_width {0};
+    uint16_t exceptions_count {0};
+    uint64_t unq_c {0};
+    uint16_t freq {0};
+    double size {0};
+    uint64_t right_bit_width {0};
+    uint64_t left_bit_width {0};
+    std::vector<std::pair<uint16_t, uint64_t>> repetition_vec;
+};
+
+inline std::string to_str(double val) {
+    std::stringstream stream;
+    stream << std::fixed << std::setprecision(2) << val;
+    std::string str = stream.str();
+    return str;
+}
+
+inline std::unordered_map<std::string, std::string> results = {
+    //
+    {"Air-Pressure", "16.43"}, {"Arade/4", "24.94"},     {"Basel-Temp", "30.72"},  {"Basel-Wind", "29.81"},
+    {"Bird-Mig", "20.14"},     {"Btc-Price", "26.37"},   {"Blockchain", "36.49"},  {"City-Temp", "10.74"},
+    {"CMS/1", "35.65"},        {"CMS/9", "11.67"},       {"CMS/25", "41.11"},      {"Dew-Temp", "13.40"},
+    {"Bio-Temp", "10.75"},     {"Food-prices", "23.65"}, {"Gov/10", "30.99"},      {"Gov/26", "0.41"},
+    {"Gov/30", "7.48"},        {"Gov/31", "3.05"},       {"Gov/40", "0.83"},       {"Medicare/1", "39.35"},
+    {"Medicare/9", "12.26"},   {"PM10-dust", "8.56"},    {"NYC/29", "40.38"},      {"SD-bench", "16.21"},
+    {"Stocks-DE", "11.01"},    {"Stocks-UK", "12.59"},   {"Stocks-USA", "7.90"},   {"Wind-dir", "15.89"},
+    //
+};
+
+} // namespace alp_bench
+
+#endif // BENCHMARKS_RESULT_HPP
diff --git a/benchmarks/include/chimp/bit_reader.hpp b/benchmarks/include/chimp/bit_reader.hpp
new file mode 100644
index 0000000..2c82a5d
--- /dev/null
+++ b/benchmarks/include/chimp/bit_reader.hpp
@@ -0,0 +1,152 @@
+#pragma once
+
+#include "duckdb/duckdb.h"
+
+namespace alp_bench {
+
+//! Every byte read touches at most 2 bytes (1 if it's perfectly aligned)
+//! Within a byte we need to mask off the bits that we're interested in
+
+struct BitReader {
+private:
+    //! 
Align the masks to the right + static constexpr uint8_t MASKS[] = { + 0, // 0b00000000, + 128, // 0b10000000, + 192, // 0b11000000, + 224, // 0b11100000, + 240, // 0b11110000, + 248, // 0b11111000, + 252, // 0b11111100, + 254, // 0b11111110, + 255, // 0b11111111, + // These later masks are for the cases where index + SIZE exceeds 8 + 254, // 0b11111110, + 252, // 0b11111100, + 248, // 0b11111000, + 240, // 0b11110000, + 224, // 0b11100000, + 192, // 0b11000000, + 128, // 0b10000000, + }; + + static constexpr uint8_t REMAINDER_MASKS[] = { + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 128, // 0b10000000, + 192, // 0b11000000, + 224, // 0b11100000, + 240, // 0b11110000, + 248, // 0b11111000, + 252, // 0b11111100, + 254, // 0b11111110, + 255, // 0b11111111, + }; + +public: +public: + BitReader() + : input(nullptr) + , index(0) {} + uint8_t* input; + uint32_t index; + +public: + void SetStream(uint8_t* input) { + this->input = input; + index = 0; + } + + inline uint8_t BitIndex() const { return (index & 7); } + inline uint64_t ByteIndex() const { return (index >> 3); } + + inline uint8_t InnerReadByte(const uint8_t& offset) { + uint8_t result = input[ByteIndex() + offset] << BitIndex() | + ((input[ByteIndex() + offset + 1] & REMAINDER_MASKS[8 + BitIndex()]) >> (8 - BitIndex())); + return result; + } + + //! index: 4 + //! size: 7 + //! input: [12345678][12345678] + //! result: [-AAAA BBB] + //! + //! Result contains 4 bits from the first byte (making up the most significant bits) + //! And 3 bits from the second byte (the least significant bits) + inline uint8_t InnerRead(const uint8_t& size, const uint8_t& offset) { + const uint8_t right_shift = 8 - size; + const uint8_t bit_remainder = (8 - ((size + BitIndex()) - 8)) & 7; + // The least significant bits are positioned at the far right of the byte + + // Create a mask given the size and index + // Take the first byte + // Left-shift it by index, to line up the bits we're interested in with the mask + // Get the mask for the given size + // Bit-wise AND the byte and the mask together + // Right-shift this result (the most significant bits) + + // Sometimes we will need to read from the second byte + // But to make this branchless, we will perform what is basically a no-op if this condition is not true + // SPILL = (index + size >= 8) + // + // If SPILL is true: + // The REMAINDER_MASKS gives us the mask for the bits we're interested in + // We bit-wise AND these together (no need to shift anything because the index is essentially zero for this new + // byte) And we then right-shift these bits in place (to the right of the previous bits) + const bool spill_to_next_byte = (size + BitIndex() >= 8); + uint8_t result = + ((input[ByteIndex() + offset] << BitIndex()) & MASKS[size]) >> right_shift | + ((input[ByteIndex() + offset + spill_to_next_byte] & REMAINDER_MASKS[size + BitIndex()]) >> bit_remainder); + return result; + } + + template + inline T ReadBytes(const uint8_t& remainder) { + T result = 0; + if (BYTES > 0) { result = result << 8 | InnerReadByte(0); } + if (BYTES > 1) { result = result << 8 | InnerReadByte(1); } + if (BYTES > 2) { result = result << 8 | InnerReadByte(2); } + if (BYTES > 3) { result = result << 8 | InnerReadByte(3); } + if (BYTES > 4) { result = result << 8 | InnerReadByte(4); } + if (BYTES > 5) { result = result << 8 | InnerReadByte(5); } + if (BYTES > 6) { result = result << 8 | InnerReadByte(6); } + if (BYTES > 7) { result = result << 8 | InnerReadByte(7); } + result = result << remainder | InnerRead(remainder, BYTES); 
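+        // The unrolled byte reads above assemble the value most-significant
+        // byte first; the final InnerRead() appends the remaining (< 8) bits,
+        // so the cursor below advances by exactly BYTES * 8 + remainder bits: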
+ index += (BYTES << 3) + remainder; + return result; + } + + template + inline T ReadBytes(const uint8_t& bytes, const uint8_t& remainder) { + T result = 0; + for (uint8_t i = 0; i < bytes; i++) { + result = result << 8 | InnerReadByte(i); + } + result = result << remainder | InnerRead(remainder, bytes); + index += (bytes << 3) + remainder; + return result; + } + + template + inline T ReadValue() { + constexpr uint8_t BYTES = (SIZE >> 3); + constexpr uint8_t REMAINDER = (SIZE & 7); + return ReadBytes(REMAINDER); + } + + template + inline T ReadValue(const uint8_t& size) { + const uint8_t bytes = size >> 3; // divide by 8; + const uint8_t remainder = size & 7; + return ReadBytes(bytes, remainder); + } +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/bit_utils.hpp b/benchmarks/include/chimp/bit_utils.hpp new file mode 100644 index 0000000..f8f82f8 --- /dev/null +++ b/benchmarks/include/chimp/bit_utils.hpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/algorithm/bit_utils.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +namespace alp_bench { + +template +struct BitUtils { + static constexpr R Mask(unsigned int const bits) { + return (((uint64_t)(bits < (sizeof(R) * 8))) << (bits & ((sizeof(R) * 8) - 1))) - 1U; + } +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/byte_reader.hpp b/benchmarks/include/chimp/byte_reader.hpp new file mode 100644 index 0000000..3cf7c46 --- /dev/null +++ b/benchmarks/include/chimp/byte_reader.hpp @@ -0,0 +1,121 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/algorithm/byte_reader.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/duckdb.h" +#include "duckdb/exception.hpp" +#include "duckdb/fast_mem.hpp" + +namespace alp_bench { + +class ByteReader { +public: + ByteReader() + : buffer(nullptr) + , index(0) {} + +public: + void SetStream(const uint8_t* buffer) { + this->buffer = buffer; + index = 0; + } + + size_t Index() const { return index; } + + template + T ReadValue() { + auto result = Load(buffer + index); + index += sizeof(T); + return result; + } + + template + T ReadValue() { + return ReadValue(SIZE); + } + + template + inline T ReadValue(uint8_t bytes, uint8_t trailing_zero) { + T result = 0; + switch (bytes) { + case 1: + result = Load(buffer + index); + index++; + return result; + case 2: + result = Load(buffer + index); + index += 2; + return result; + case 3: + memcpy(&result, (void*)(buffer + index), 3); + index += 3; + return result; + case 4: + result = Load(buffer + index); + index += 4; + return result; + case 5: + memcpy(&result, (void*)(buffer + index), 5); + index += 5; + return result; + case 6: + memcpy(&result, (void*)(buffer + index), 6); + index += 6; + return result; + case 7: + memcpy(&result, (void*)(buffer + index), 7); + index += 7; + return result; + default: + if (trailing_zero < 8) { + result = Load(buffer + index); + index += sizeof(T); + return result; + } + return result; + } + } + +private: + const uint8_t* buffer; + uint32_t index; +}; + +template <> +inline uint32_t ByteReader::ReadValue(uint8_t bytes, uint8_t trailing_zero) { + uint32_t result = 0; + switch (bytes) { + case 0: + if (trailing_zero < 8) { + result = Load(buffer + index); + index += sizeof(uint32_t); + 
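+            // (bytes == 0 with trailing_zero < 8 denotes a full, unpacked
+            // 4-byte value, hence the whole-word load above)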
return result; + } + return result; + case 1: + result = Load(buffer + index); + index++; + return result; + case 2: + result = Load(buffer + index); + index += 2; + return result; + case 3: + memcpy(&result, (void*)(buffer + index), 3); + index += 3; + return result; + case 4: + result = Load(buffer + index); + index += 4; + return result; + default: + throw InternalException("Write of %llu bytes attempted into address pointing to 4 byte value", bytes); + } +} +} // namespace alp_bench diff --git a/benchmarks/include/chimp/byte_writer.hpp b/benchmarks/include/chimp/byte_writer.hpp new file mode 100644 index 0000000..149c66f --- /dev/null +++ b/benchmarks/include/chimp/byte_writer.hpp @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/algorithm/byte_writer.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/duckdb.h" +#include "duckdb/helper.hpp" +#include + +namespace alp_bench { + +template +class ByteWriter { +public: + ByteWriter() + : buffer(nullptr) + , index(0) {} + +public: + idx_t BytesWritten() const { return index; } + + void Flush() {} + + void ByteAlign() {} + + void SetStream(uint8_t* buffer) { + this->buffer = buffer; + this->index = 0; + } + + template + void WriteValue(const T& value) { + const uint8_t bytes = (SIZE >> 3) + ((SIZE & 7) != 0); + if (!EMPTY) { memcpy((void*)(buffer + index), &value, bytes); } + index += bytes; + } + + template + void WriteValue(const T& value, const uint8_t& size) { + const uint8_t bytes = (size >> 3) + ((size & 7) != 0); + if (!EMPTY) { memcpy((void*)(buffer + index), &value, bytes); } + index += bytes; + } + +private: +private: + uint8_t* buffer; + idx_t index; +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/chimp.hpp b/benchmarks/include/chimp/chimp.hpp new file mode 100644 index 0000000..9487d15 --- /dev/null +++ b/benchmarks/include/chimp/chimp.hpp @@ -0,0 +1,242 @@ +#pragma once + +#include "bit_reader.hpp" +#include "chimp_utils.hpp" +#include "duckdb/duckdb.h" +#include "duckdb/exception.hpp" +#include "duckdb/fast_mem.hpp" +#include "duckdb/likely.hpp" +#include "duckdb/limits.hpp" +#include "flag_buffer.hpp" +#include "leading_zero_buffer.hpp" +#include "output_bit_stream.hpp" + +namespace alp_bench { + +//===--------------------------------------------------------------------===// +// Compression +//===--------------------------------------------------------------------===// + +template +struct ChimpCompressionState { + + ChimpCompressionState() + : previous_leading_zeros(NumericLimits::Maximum()) { + previous_value = 0; + } + + inline void SetLeadingZeros(int32_t value = NumericLimits::Maximum()) { + this->previous_leading_zeros = value; + } + + void Flush() { leading_zero_buffer.Flush(); } + + // Reset the state + void Reset() { + first = true; + SetLeadingZeros(); + leading_zero_buffer.Reset(); + flag_buffer.Reset(); + previous_value = 0; + } + + CHIMP_TYPE BitsWritten() const { + return output.BitsWritten() + leading_zero_buffer.BitsWritten() + flag_buffer.BitsWritten(); + } + + OutputBitStream output; // The stream to write to + LeadingZeroBuffer leading_zero_buffer; + FlagBuffer flag_buffer; + uint8_t previous_leading_zeros; //! The leading zeros of the reference value + CHIMP_TYPE previous_value = 0; + bool first = true; +}; + +template +class ChimpCompression { +public: + using State = ChimpCompressionState; + + //! 
The amount of bits needed to store an index between 0-127 + static constexpr uint8_t INDEX_BITS_SIZE = 7; + static constexpr uint8_t SIGNIFICANT_BITS_SIZE = + 6; // The amount needed to store the maximum number of significant bits (0-63) + static constexpr uint8_t BIT_SIZE = sizeof(CHIMP_TYPE) * 8; + + static constexpr uint8_t TRAILING_ZERO_THRESHOLD = SignificantBits::size + INDEX_BITS_SIZE; + + static void Store(CHIMP_TYPE in, State& state) { + if (state.first) { + WriteFirst(in, state); + } else { + CompressValue(in, state); + } + } + + //! Write the content of the bit buffer to the stream + static void Flush(State& state) { + if (!EMPTY) { state.output.Flush(); } + } + + static void WriteFirst(CHIMP_TYPE in, State& state) { + state.output.template WriteValue(in); + state.previous_value = in; + state.first = false; + } + + static void CompressValue(CHIMP_TYPE in, State& state) { + + CHIMP_TYPE xor_result; + uint8_t previous_index; + uint32_t trailing_zeros = 0; + bool trailing_zeros_exceed_threshold = false; + + xor_result = (CHIMP_TYPE)in ^ state.previous_value; + + // Compress the value + if (xor_result == 0) { + state.flag_buffer.Insert(ChimpConstants::Flags::VALUE_IDENTICAL); + // state.output.template WriteValue(previous_index); + state.SetLeadingZeros(); + } else { + // Values are not identical + auto leading_zeros_raw = CountZeros::Leading(xor_result); + uint8_t leading_zeros = ChimpConstants::Compression::LEADING_ROUND[leading_zeros_raw]; + + trailing_zeros = CountZeros::Trailing(xor_result); + trailing_zeros_exceed_threshold = trailing_zeros > 6; + + if (trailing_zeros_exceed_threshold) { + uint32_t significant_bits = BIT_SIZE - leading_zeros - trailing_zeros; + state.flag_buffer.Insert(ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD); + state.leading_zero_buffer.Insert(ChimpConstants::Compression::LEADING_REPRESENTATION[leading_zeros]); + state.output.template WriteValue(significant_bits); + state.output.template WriteValue(xor_result >> trailing_zeros, significant_bits); + state.SetLeadingZeros(); + } else if (leading_zeros == state.previous_leading_zeros) { + state.flag_buffer.Insert(ChimpConstants::Flags::LEADING_ZERO_EQUALITY); + int32_t significant_bits = BIT_SIZE - leading_zeros; + state.output.template WriteValue(xor_result, significant_bits); + } else { + state.flag_buffer.Insert(ChimpConstants::Flags::LEADING_ZERO_LOAD); + const int32_t significant_bits = BIT_SIZE - leading_zeros; + state.leading_zero_buffer.Insert(ChimpConstants::Compression::LEADING_REPRESENTATION[leading_zeros]); + state.output.template WriteValue(xor_result, significant_bits); + state.SetLeadingZeros(leading_zeros); + } + } + state.previous_value = in; + } +}; + +//===--------------------------------------------------------------------===// +// Decompression +//===--------------------------------------------------------------------===// + +template +struct ChimpDecompressionState { +public: + ChimpDecompressionState() + : reference_value(0) + , first(true) { + ResetZeros(); + } + + void Reset() { + ResetZeros(); + reference_value = 0; + first = true; + } + + inline void ResetZeros() { + leading_zeros = NumericLimits::Maximum(); + trailing_zeros = 0; + } + + inline void SetLeadingZeros(uint8_t value) { leading_zeros = value; } + + inline void SetTrailingZeros(uint8_t value) { + D_ASSERT(value <= sizeof(CHIMP_TYPE) * 8); + trailing_zeros = value; + } + + uint8_t LeadingZeros() const { return leading_zeros; } + uint8_t TrailingZeros() const { return trailing_zeros; } + + BitReader input; + uint8_t 
leading_zeros; + uint8_t trailing_zeros; + CHIMP_TYPE reference_value = 0; + + bool first; +}; + +template +struct ChimpDecompression { +public: + using DecompressState = ChimpDecompressionState; + + static constexpr uint8_t INDEX_BITS_SIZE = 7; + static constexpr uint8_t BIT_SIZE = sizeof(CHIMP_TYPE) * 8; + static constexpr uint8_t SIGNIFICANT_BITS_SIZE = 6; + + static inline CHIMP_TYPE + Load(ChimpConstants::Flags flag, uint8_t leading_zeros[], uint32_t& leading_zero_index, DecompressState& state) { + if (DUCKDB_UNLIKELY(state.first)) { + return LoadFirst(state); + } else { + return DecompressValue(flag, leading_zeros, leading_zero_index, state); + } + } + + static inline CHIMP_TYPE LoadFirst(DecompressState& state) { + CHIMP_TYPE result = state.input.template ReadValue(); + state.first = false; + state.reference_value = result; + return result; + } + + static inline CHIMP_TYPE DecompressValue(ChimpConstants::Flags flag, + uint8_t leading_zeros[], + uint32_t& leading_zero_index, + DecompressState& state) { + CHIMP_TYPE result; + switch (flag) { + case ChimpConstants::Flags::VALUE_IDENTICAL: { + //! Value is identical to previous value + result = state.reference_value; + break; + } + case ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD: { + state.leading_zeros = leading_zeros[leading_zero_index++]; + auto significant_bits = state.input.template ReadValue(SIGNIFICANT_BITS_SIZE); + state.trailing_zeros = BIT_SIZE - significant_bits - state.leading_zeros; + result = state.input.template ReadValue(significant_bits); + result <<= state.trailing_zeros; + result ^= state.reference_value; + break; + } + case ChimpConstants::Flags::LEADING_ZERO_EQUALITY: { + result = state.input.template ReadValue(BIT_SIZE - state.leading_zeros); + result ^= state.reference_value; + break; + } + case ChimpConstants::Flags::LEADING_ZERO_LOAD: { + state.leading_zeros = leading_zeros[leading_zero_index++]; + D_ASSERT(state.leading_zeros <= BIT_SIZE); + result = state.input.template ReadValue(BIT_SIZE - state.leading_zeros); + result ^= state.reference_value; + break; + } + default: + // std::cout << "Chimp compression flag with value not recognized "; + // std::cout << flag; + break; + // throw InternalException("Chimp compression flag with value %d not recognized", flag); + } + state.reference_value = result; + return result; + } +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/chimp128.hpp b/benchmarks/include/chimp/chimp128.hpp new file mode 100644 index 0000000..3047ad6 --- /dev/null +++ b/benchmarks/include/chimp/chimp128.hpp @@ -0,0 +1,294 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/algorithm/chimp128.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "bit_reader.hpp" +#include "chimp_utils.hpp" +#include "duckdb/duckdb.h" +#include "duckdb/exception.hpp" +#include "duckdb/fast_mem.hpp" +#include "duckdb/likely.hpp" +#include "duckdb/limits.hpp" +#include "flag_buffer.hpp" +#include "leading_zero_buffer.hpp" +#include "output_bit_stream.hpp" +#include "packed_data.hpp" +#include "ring_buffer.hpp" + +namespace alp_bench { + +//===--------------------------------------------------------------------===// +// Compression +//===--------------------------------------------------------------------===// + +template +struct Chimp128CompressionState { + + Chimp128CompressionState() + : ring_buffer() + , 
previous_leading_zeros(NumericLimits::Maximum()) { + previous_value = 0; + } + + inline void SetLeadingZeros(int32_t value = NumericLimits::Maximum()) { + this->previous_leading_zeros = value; + } + + void Flush() { leading_zero_buffer.Flush(); } + + // Reset the state + void Reset() { + first = true; + ring_buffer.Reset(); + SetLeadingZeros(); + leading_zero_buffer.Reset(); + flag_buffer.Reset(); + packed_data_buffer.Reset(); + previous_value = 0; + } + + CHIMP_TYPE BitsWritten() const { + return output.BitsWritten() + leading_zero_buffer.BitsWritten() + flag_buffer.BitsWritten() + + (packed_data_buffer.index * 16); + } + + OutputBitStream output; // The stream to write to + LeadingZeroBuffer leading_zero_buffer; + FlagBuffer flag_buffer; + PackedDataBuffer packed_data_buffer; + RingBuffer ring_buffer; //! The ring buffer that holds the previous values + uint8_t previous_leading_zeros; //! The leading zeros of the reference value + CHIMP_TYPE previous_value = 0; + bool first = true; +}; + +template +class Chimp128Compression { +public: + using State = Chimp128CompressionState; + + //! The amount of bits needed to store an index between 0-127 + static constexpr uint8_t INDEX_BITS_SIZE = 7; + static constexpr uint8_t BIT_SIZE = sizeof(CHIMP_TYPE) * 8; + + static constexpr uint8_t TRAILING_ZERO_THRESHOLD = SignificantBits::size + INDEX_BITS_SIZE; + + static void Store(CHIMP_TYPE in, State& state) { + if (state.first) { + WriteFirst(in, state); + } else { + CompressValue(in, state); + } + } + + //! Write the content of the bit buffer to the stream + static void Flush(State& state) { + if (!EMPTY) { state.output.Flush(); } + } + + static void WriteFirst(CHIMP_TYPE in, State& state) { + state.ring_buffer.template Insert(in); + state.output.template WriteValue(in); + state.previous_value = in; + state.first = false; + } + + static void CompressValue(CHIMP_TYPE in, State& state) { + + auto key = state.ring_buffer.Key(in); + CHIMP_TYPE xor_result; + uint8_t previous_index; + uint32_t trailing_zeros = 0; + bool trailing_zeros_exceed_threshold = false; + const CHIMP_TYPE reference_index = state.ring_buffer.IndexOf(key); + + // Find the reference value to use when compressing the current value + if (((int64_t)state.ring_buffer.Size() - (int64_t)reference_index) < (int64_t)ChimpConstants::BUFFER_SIZE) { + // The reference index is within 128 values, we can use it + auto current_index = state.ring_buffer.IndexOf(key); + if (current_index > state.ring_buffer.Size()) { current_index = 0; } + auto reference_value = state.ring_buffer.Value(current_index % ChimpConstants::BUFFER_SIZE); + CHIMP_TYPE tempxor_result = (CHIMP_TYPE)in ^ reference_value; + trailing_zeros = CountZeros::Trailing(tempxor_result); + trailing_zeros_exceed_threshold = trailing_zeros > TRAILING_ZERO_THRESHOLD; + if (trailing_zeros_exceed_threshold) { + previous_index = current_index % ChimpConstants::BUFFER_SIZE; + xor_result = tempxor_result; + } else { + previous_index = state.ring_buffer.Size() % ChimpConstants::BUFFER_SIZE; + xor_result = (CHIMP_TYPE)in ^ state.ring_buffer.Value(previous_index); + } + } else { + // Reference index is not in range, use the directly previous value + previous_index = state.ring_buffer.Size() % ChimpConstants::BUFFER_SIZE; + xor_result = (CHIMP_TYPE)in ^ state.ring_buffer.Value(previous_index); + } + + // Compress the value + if (xor_result == 0) { + state.flag_buffer.Insert(ChimpConstants::Flags::VALUE_IDENTICAL); + state.output.template WriteValue(previous_index); + state.SetLeadingZeros(); + } else { 
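+            // Three encodings follow: a 16-bit packed (index, leading-zero
+            // code, significant-bit count) header when the trailing zeros
+            // exceed the threshold, otherwise the raw XOR residue with either
+            // a reused or a freshly recorded leading-zero count.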
+ // Values are not identical + auto leading_zeros_raw = CountZeros::Leading(xor_result); + uint8_t leading_zeros = ChimpConstants::Compression::LEADING_ROUND[leading_zeros_raw]; + + if (trailing_zeros_exceed_threshold) { + state.flag_buffer.Insert(ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD); + uint32_t significant_bits = BIT_SIZE - leading_zeros - trailing_zeros; + auto result = PackedDataUtils::Pack( + reference_index, + ChimpConstants::Compression::LEADING_REPRESENTATION[leading_zeros], + significant_bits); + state.packed_data_buffer.Insert(result & 0xFFFF); + state.output.template WriteValue(xor_result >> trailing_zeros, significant_bits); + state.SetLeadingZeros(); + } else if (leading_zeros == state.previous_leading_zeros) { + state.flag_buffer.Insert(ChimpConstants::Flags::LEADING_ZERO_EQUALITY); + int32_t significant_bits = BIT_SIZE - leading_zeros; + state.output.template WriteValue(xor_result, significant_bits); + } else { + state.flag_buffer.Insert(ChimpConstants::Flags::LEADING_ZERO_LOAD); + const int32_t significant_bits = BIT_SIZE - leading_zeros; + state.leading_zero_buffer.Insert(ChimpConstants::Compression::LEADING_REPRESENTATION[leading_zeros]); + state.output.template WriteValue(xor_result, significant_bits); + state.SetLeadingZeros(leading_zeros); + } + } + state.previous_value = in; + state.ring_buffer.Insert(in); + } +}; + +//===--------------------------------------------------------------------===// +// Decompression +//===--------------------------------------------------------------------===// + +template +struct Chimp128DecompressionState { +public: + Chimp128DecompressionState() + : reference_value(0) + , first(true) { + ResetZeros(); + } + + void Reset() { + ResetZeros(); + reference_value = 0; + ring_buffer.Reset(); + first = true; + } + + inline void ResetZeros() { + leading_zeros = NumericLimits::Maximum(); + trailing_zeros = 0; + } + + inline void SetLeadingZeros(uint8_t value) { leading_zeros = value; } + + inline void SetTrailingZeros(uint8_t value) { + D_ASSERT(value <= sizeof(CHIMP_TYPE) * 8); + trailing_zeros = value; + } + + uint8_t LeadingZeros() const { return leading_zeros; } + uint8_t TrailingZeros() const { return trailing_zeros; } + + BitReader input; + uint8_t leading_zeros; + uint8_t trailing_zeros; + CHIMP_TYPE reference_value = 0; + RingBuffer ring_buffer; + + bool first; +}; + +template +struct Chimp128Decompression { +public: + using DecompressState = Chimp128DecompressionState; + + static constexpr uint8_t INDEX_BITS_SIZE = 7; + static constexpr uint8_t BIT_SIZE = sizeof(CHIMP_TYPE) * 8; + + static inline void UnpackPackedData(uint16_t packed_data, UnpackedData& dest) { + return PackedDataUtils::Unpack(packed_data, dest); + } + + static inline CHIMP_TYPE Load(ChimpConstants::Flags flag, + uint8_t leading_zeros[], + uint32_t& leading_zero_index, + UnpackedData unpacked_data[], + uint32_t& unpacked_index, + DecompressState& state) { + if (DUCKDB_UNLIKELY(state.first)) { + return LoadFirst(state); + } else { + return DecompressValue(flag, leading_zeros, leading_zero_index, unpacked_data, unpacked_index, state); + } + } + + static inline CHIMP_TYPE LoadFirst(DecompressState& state) { + CHIMP_TYPE result = state.input.template ReadValue(); + state.ring_buffer.template InsertScan(result); + state.first = false; + state.reference_value = result; + return result; + } + + static inline CHIMP_TYPE DecompressValue(ChimpConstants::Flags flag, + uint8_t leading_zeros[], + uint32_t& leading_zero_index, + UnpackedData unpacked_data[], + uint32_t& 
unpacked_index, + DecompressState& state) { + CHIMP_TYPE result; + switch (flag) { + case ChimpConstants::Flags::VALUE_IDENTICAL: { + //! Value is identical to previous value + auto index = state.input.template ReadValue(); + result = state.ring_buffer.Value(index); + break; + } + case ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD: { + const UnpackedData& unpacked = unpacked_data[unpacked_index++]; + state.leading_zeros = unpacked.leading_zero; + state.trailing_zeros = BIT_SIZE - unpacked.significant_bits - state.leading_zeros; + result = state.input.template ReadValue(unpacked.significant_bits); + result <<= state.trailing_zeros; + result ^= state.ring_buffer.Value(unpacked.index); + break; + } + case ChimpConstants::Flags::LEADING_ZERO_EQUALITY: { + result = state.input.template ReadValue(BIT_SIZE - state.leading_zeros); + result ^= state.reference_value; + break; + } + case ChimpConstants::Flags::LEADING_ZERO_LOAD: { + state.leading_zeros = leading_zeros[leading_zero_index++]; + D_ASSERT(state.leading_zeros <= BIT_SIZE); + result = state.input.template ReadValue(BIT_SIZE - state.leading_zeros); + result ^= state.reference_value; + break; + } + default: + // std::cout << "Chimp compression flag with value not recognized "; + // std::cout << flag; + break; + // throw InternalException("Chimp compression flag with value %d not recognized", flag); + } + state.reference_value = result; + state.ring_buffer.InsertScan(result); + return result; + } +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/chimp_utils.hpp b/benchmarks/include/chimp/chimp_utils.hpp new file mode 100644 index 0000000..aab67b5 --- /dev/null +++ b/benchmarks/include/chimp/chimp_utils.hpp @@ -0,0 +1,139 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/algorithm/chimp_utils.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/duckdb.h" + +#ifdef _MSC_VER +#define __restrict__ +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ 2 +#include +static inline int __builtin_ctzll(unsigned long long x) { +#ifdef _WIN64 + unsigned long ret; + _BitScanForward64(&ret, x); + return (int)ret; +#else + unsigned long low, high; + bool low_set = _BitScanForward(&low, (unsigned __int32)(x)) != 0; + _BitScanForward(&high, (unsigned __int32)(x >> 32)); + high += 32; + return low_set ? low : high; +#endif +} +static inline int __builtin_clzll(unsigned long long mask) { + unsigned long where; +// BitScanReverse scans from MSB to LSB for first set bit. +// Returns 0 if no set bit is found. +#if defined(_WIN64) + if (_BitScanReverse64(&where, mask)) return static_cast(63 - where); +#elif defined(_WIN32) + // Scan the high 32 bits. + if (_BitScanReverse(&where, static_cast(mask >> 32))) + return static_cast(63 - (where + 32)); // Create a bit offset from the MSB. + // Scan the low 32 bits. + if (_BitScanReverse(&where, static_cast(mask))) return static_cast(63 - where); +#else +#error "Implementation of __builtin_clzll required" +#endif + return 64; // Undefined Behavior. 
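+    // Like the GCC/Clang intrinsics, the result for a zero mask is undefined;
+    // the CountZeros wrappers below guard against zero inputs before calling
+    // these shims.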
+} + +static inline int __builtin_ctz(unsigned int value) { + unsigned long trailing_zero = 0; + + if (_BitScanForward(&trailing_zero, value)) { + return trailing_zero; + } else { + // This is undefined, I better choose 32 than 0 + return 32; + } +} + +static inline int __builtin_clz(unsigned int value) { + unsigned long leading_zero = 0; + + if (_BitScanReverse(&leading_zero, value)) { + return 31 - leading_zero; + } else { + // Same remarks as above + return 32; + } +} + +#endif + +namespace alp_bench { + +template +struct SignificantBits {}; + +template <> +struct SignificantBits { + static constexpr uint8_t size = 6; + static constexpr uint8_t mask = ((uint8_t)1 << size) - 1; +}; + +template <> +struct SignificantBits { + static constexpr uint8_t size = 5; + static constexpr uint8_t mask = ((uint8_t)1 << size) - 1; +}; + +template +struct CountZeros {}; + +template <> +struct CountZeros { + inline static int Leading(uint32_t value) { + if (!value) { return 32; } + return __builtin_clz(value); + } + inline static int Trailing(uint32_t value) { + if (!value) { return 32; } + return __builtin_ctz(value); + } +}; + +template <> +struct CountZeros { + inline static int Leading(uint64_t value) { + if (!value) { return 64; } + return __builtin_clzll(value); + } + inline static int Trailing(uint64_t value) { + if (!value) { return 64; } + return __builtin_ctzll(value); + } +}; + +struct ChimpConstants { + struct Compression { + static constexpr uint8_t LEADING_ROUND[] = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 12, 12, 12, 12, + 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; + static constexpr uint8_t LEADING_REPRESENTATION[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + }; + struct Decompression { + static constexpr uint8_t LEADING_REPRESENTATION[] = {0, 8, 12, 16, 18, 20, 22, 24}; + }; + static constexpr uint8_t BUFFER_SIZE = 128; + enum class Flags : uint8_t { + VALUE_IDENTICAL = 0, + TRAILING_EXCEEDS_THRESHOLD = 1, + LEADING_ZERO_EQUALITY = 2, + LEADING_ZERO_LOAD = 3 + }; +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/flag_buffer.hpp b/benchmarks/include/chimp/flag_buffer.hpp new file mode 100644 index 0000000..bd46b67 --- /dev/null +++ b/benchmarks/include/chimp/flag_buffer.hpp @@ -0,0 +1,125 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/flag_buffer.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "chimp/chimp_utils.hpp" +#include "duckdb/duckdb.h" +#include "gorillas/gorillas_utils.hpp" +#ifdef DEBUG +#include "dependencies/assert.hpp" +#include "dependencies/vector.hpp" +#endif + +namespace alp_bench { + +struct FlagBufferConstants { + static constexpr uint8_t MASKS[4] = { + 192, // 0b1100 0000, + 48, // 0b0011 0000, + 12, // 0b0000 1100, + 3, // 0b0000 0011, + }; + + static constexpr uint8_t SHIFTS[4] = {6, 4, 2, 0}; +}; + +// This class is responsible for writing and reading the flag bits +// Only the last group is potentially not 1024 (GROUP_SIZE) values in size +// But we can determine from the count of the segment whether this is the case or not +// So we can just read/write from left to right +template 
+class FlagBuffer { + +public: + FlagBuffer() + : counter(0) + , buffer(nullptr) {} + +public: + void SetBuffer(uint8_t* buffer) { + this->buffer = buffer; + this->counter = 0; + } + void Reset() { + this->counter = 0; +#ifdef DEBUG + this->flags.clear(); +#endif + } + +#ifdef DEBUG + uint8_t ExtractValue(uint32_t value, uint8_t index) { + return (value & FlagBufferConstants::MASKS[index]) >> FlagBufferConstants::SHIFTS[index]; + } +#endif + + uint64_t BitsWritten() const { return counter * 2; } + + void Insert(ChimpConstants::Flags value) { + if (!EMPTY) { + if ((counter & 3) == 0) { + // Start the new byte fresh + buffer[counter >> 2] = 0; +#ifdef DEBUG + flags.clear(); +#endif + } +#ifdef DEBUG + flags.push_back((uint8_t)value); +#endif + buffer[counter >> 2] |= (((uint8_t)value & 3) << FlagBufferConstants::SHIFTS[counter & 3]); +#ifdef DEBUG + // Verify that the bits are serialized correctly + D_ASSERT(flags[counter & 3] == ExtractValue(buffer[counter >> 2], counter & 3)); +#endif + } + counter++; + } + + void Insert(GorillasConstants::Flags value) { + if (!EMPTY) { + if ((counter & 3) == 0) { + // Start the new byte fresh + buffer[counter >> 2] = 0; +#ifdef DEBUG + flags.clear(); +#endif + } +#ifdef DEBUG + flags.push_back((uint8_t)value); +#endif + buffer[counter >> 2] |= (((uint8_t)value & 3) << FlagBufferConstants::SHIFTS[counter & 3]); +#ifdef DEBUG + // Verify that the bits are serialized correctly + D_ASSERT(flags[counter & 3] == ExtractValue(buffer[counter >> 2], counter & 3)); +#endif + } + counter++; + } + inline uint8_t Extract() { + const uint8_t result = (buffer[counter >> 2] & FlagBufferConstants::MASKS[counter & 3]) >> + FlagBufferConstants::SHIFTS[counter & 3]; + counter++; + return result; + } + + uint32_t BytesUsed() const { return (counter >> 2) + ((counter & 3) != 0); } + + uint32_t FlagCount() const { return counter; } + +private: +private: + uint32_t counter = 0; + uint8_t* buffer; +#ifdef DEBUG + vector flags; +#endif +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/leading_zero_buffer.hpp b/benchmarks/include/chimp/leading_zero_buffer.hpp new file mode 100644 index 0000000..637931a --- /dev/null +++ b/benchmarks/include/chimp/leading_zero_buffer.hpp @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/leading_zero_buffer.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/duckdb.h" +#include "duckdb/helper.hpp" +#ifdef DEBUG +#include "dependencies/assert.hpp" +#include "dependencies/vector.hpp" +#endif + +namespace alp_bench { + +//! This class is in charge of storing the leading_zero_bits, which are of a fixed size +//! These are packed together so that the rest of the data can be byte-aligned +//! 
The leading zero bit data is read from left to right + +struct LeadingZeroBufferConstants { + static constexpr uint32_t MASKS[8] = { + 7, // 0b 00000000 00000000 00000000 00000111, + 56, // 0b 00000000 00000000 00000000 00111000, + 448, // 0b 00000000 00000000 00000001 11000000, + 3584, // 0b 00000000 00000000 00001110 00000000, + 28672, // 0b 00000000 00000000 01110000 00000000, + 229376, // 0b 00000000 00000011 10000000 00000000, + 1835008, // 0b 00000000 00011100 00000000 00000000, + 14680064, // 0b 00000000 11100000 00000000 00000000, + }; + + // We're not using the last byte (the most significant) of the 4 bytes we're accessing + static constexpr uint8_t SHIFTS[8] = {0, 3, 6, 9, 12, 15, 18, 21}; +}; + +template +class LeadingZeroBuffer { + +public: + static constexpr uint32_t CHIMP_GROUP_SIZE = 1024; + static constexpr uint32_t LEADING_ZERO_BITS_SIZE = 3; + static constexpr uint32_t LEADING_ZERO_BLOCK_SIZE = 8; + static constexpr uint32_t LEADING_ZERO_BLOCK_BIT_SIZE = LEADING_ZERO_BLOCK_SIZE * LEADING_ZERO_BITS_SIZE; + static constexpr uint32_t MAX_LEADING_ZERO_BLOCKS = CHIMP_GROUP_SIZE / LEADING_ZERO_BLOCK_SIZE; + static constexpr uint32_t MAX_BITS_USED_BY_ZERO_BLOCKS = MAX_LEADING_ZERO_BLOCKS * LEADING_ZERO_BLOCK_BIT_SIZE; + static constexpr uint32_t MAX_BYTES_USED_BY_ZERO_BLOCKS = MAX_BITS_USED_BY_ZERO_BLOCKS / 8; + + // Add an extra byte to prevent heap buffer overflow on the last group, because we'll be addressing 4 bytes each + static constexpr uint32_t BUFFER_SIZE = + MAX_BYTES_USED_BY_ZERO_BLOCKS + (sizeof(uint32_t) - (LEADING_ZERO_BLOCK_BIT_SIZE / 8)); + + template + const T Load(const uint8_t* ptr) { + T ret; + memcpy(&ret, ptr, sizeof(ret)); + return ret; + } + +public: + LeadingZeroBuffer() + : current(0) + , counter(0) + , buffer(nullptr) {} + void SetBuffer(uint8_t* buffer) { + // Set the internal buffer, when inserting this should be BUFFER_SIZE bytes in length + // This buffer does not need to be zero-initialized for inserting + this->buffer = buffer; + this->counter = 0; + } + void Flush() { + if ((counter & 7) != 0) { FlushBuffer(); } + } + + uint64_t BitsWritten() const { return counter * 3; } + + // Reset the counter, but don't replace the buffer + void Reset() { + this->counter = 0; + current = 0; +#ifdef DEBUG + flags.clear(); +#endif + } + +public: +#ifdef DEBUG + uint8_t ExtractValue(uint32_t value, uint8_t index) { + return (value & LeadingZeroBufferConstants::MASKS[index]) >> LeadingZeroBufferConstants::SHIFTS[index]; + } +#endif + + inline uint64_t BlockIndex() const { return ((counter >> 3) * (LEADING_ZERO_BLOCK_BIT_SIZE / 8)); } + + void FlushBuffer() { + if (EMPTY) { return; } + const auto buffer_idx = BlockIndex(); + memcpy((void*)(buffer + buffer_idx), (uint8_t*)¤t, 3); +#ifdef DEBUG + // Verify that the bits are copied correctly + + uint32_t temp_value = 0; + memcpy((uint8_t*)&temp_value, (void*)(buffer + buffer_idx), 3); + for (idx_t i = 0; i < flags.size(); i++) { + D_ASSERT(flags[i] == ExtractValue(temp_value, i)); + } + flags.clear(); +#endif + } + + void Insert(const uint8_t& value) { + if (!EMPTY) { +#ifdef DEBUG + flags.push_back(value); +#endif + current |= (value & 7) << LeadingZeroBufferConstants::SHIFTS[counter & 7]; +#ifdef DEBUG + // Verify that the bits are serialized correctly + D_ASSERT(flags[counter & 7] == ExtractValue(current, counter & 7)); +#endif + + if ((counter & (LEADING_ZERO_BLOCK_SIZE - 1)) == 7) { + FlushBuffer(); + current = 0; + } + } + counter++; + } + + inline uint8_t Extract() { + const auto buffer_idx = BlockIndex(); + 
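+        // Each flushed block packs eight 3-bit codes into 3 bytes; the load
+        // below reads a full 4-byte word (BUFFER_SIZE pads a spare byte so
+        // this never overruns), and the mask/shift isolates the wanted code.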
auto const temp = Load(buffer + buffer_idx); + + const uint8_t result = + (temp & LeadingZeroBufferConstants::MASKS[counter & 7]) >> LeadingZeroBufferConstants::SHIFTS[counter & 7]; + counter++; + return result; + } + idx_t GetCount() const { return counter; } + idx_t BlockCount() const { return (counter >> 3) + ((counter & 7) != 0); } + +private: +private: + uint32_t current; + uint32_t counter = 0; // block_index * 8 + uint8_t* buffer; +#ifdef DEBUG + vector flags; +#endif +}; + +} // namespace alp_bench diff --git a/benchmarks/include/chimp/output_bit_stream.hpp b/benchmarks/include/chimp/output_bit_stream.hpp new file mode 100644 index 0000000..b06a3c7 --- /dev/null +++ b/benchmarks/include/chimp/output_bit_stream.hpp @@ -0,0 +1,178 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// lwcbench/compression/chimp/output_bit_stream.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "bit_utils.hpp" +#include "duckdb/assert.hpp" +#include "duckdb/duckdb.h" + +namespace alp_bench { + +// This class writes arbitrary amounts of bits to a stream +// The way these bits are written is most-significant bit first +// For example if 6 bits are given as: 0b0011 1111 +// The bits are written to the stream as: 0b1111 1100 +template +class OutputBitStream { + using INTERNAL_TYPE = uint8_t; + +public: + friend class BitStreamWriter; + friend class EmptyWriter; + OutputBitStream() + : stream(nullptr) + , current(0) + , free_bits(INTERNAL_TYPE_BITSIZE) + , stream_index(0) + , bits_written(0) {} + +public: + static constexpr uint8_t INTERNAL_TYPE_BITSIZE = sizeof(INTERNAL_TYPE) * 8; + + idx_t BytesWritten() const { return (bits_written >> 3) + ((bits_written & 7) != 0); } + + idx_t BitsWritten() const { return bits_written; } + + void Flush() { + if (free_bits == INTERNAL_TYPE_BITSIZE) { + // the bit buffer is empty, nothing to write + return; + } + WriteToStream(); + } + + void SetStream(uint8_t* output_stream) { + stream = output_stream; + stream_index = 0; + bits_written = 0; + free_bits = INTERNAL_TYPE_BITSIZE; + current = 0; + } + + uint64_t* Stream() { return (uint64_t*)stream; } + + idx_t BitSize() const { return (stream_index * INTERNAL_TYPE_BITSIZE) + (INTERNAL_TYPE_BITSIZE - free_bits); } + + template + void WriteRemainder(T value, uint8_t i) { + if (sizeof(T) * 8 > 32) { + if (i == 64) { WriteToStream(((uint64_t)value >> 56) & 0xFF); } + if (i > 55) { WriteToStream(((uint64_t)value >> 48) & 0xFF); } + if (i > 47) { WriteToStream(((uint64_t)value >> 40) & 0xFF); } + if (i > 39) { WriteToStream(((uint64_t)value >> 32) & 0xFF); } + } + if (i > 31) { WriteToStream((value >> 24) & 0xFF); } + if (i > 23) { WriteToStream((value >> 16) & 0xFF); } + if (i > 15) { WriteToStream((value >> 8) & 0xFF); } + if (i > 7) { WriteToStream(value); } + } + + template + void WriteValue(T value) { + bits_written += VALUE_SIZE; + if (EMPTY) { return; } + if (FitsInCurrent(VALUE_SIZE)) { + //! 
+		if (FitsInCurrent(VALUE_SIZE)) {
+			//! If we can write the entire value in one go
+			WriteInCurrent<VALUE_SIZE>((INTERNAL_TYPE)value);
+			return;
+		}
+		auto i              = VALUE_SIZE - free_bits;
+		const uint8_t queue = i & 7;
+
+		if (free_bits != 0) {
+			// Reset the number of free bits
+			WriteInCurrent((INTERNAL_TYPE)(value >> i), free_bits);
+		}
+		if (queue != 0) {
+			// We don't fill the entire 'current' buffer,
+			// so we can write these to 'current' first without flushing to the stream,
+			// and then write the remaining bytes directly to the stream
+			i -= queue;
+			WriteInCurrent((INTERNAL_TYPE)value, queue);
+			value >>= queue;
+		}
+		WriteRemainder(value, i);
+	}
+
+	template <class T>
+	void WriteValue(T value, const uint8_t& value_size) {
+		bits_written += value_size;
+		if (EMPTY) { return; }
+		if (FitsInCurrent(value_size)) {
+			//! If we can write the entire value in one go
+			WriteInCurrent((INTERNAL_TYPE)value, value_size);
+			return;
+		}
+		auto i              = value_size - free_bits;
+		const uint8_t queue = i & 7;
+
+		if (free_bits != 0) {
+			// Reset the number of free bits
+			WriteInCurrent((INTERNAL_TYPE)(value >> i), free_bits);
+		}
+		if (queue != 0) {
+			// We don't fill the entire 'current' buffer,
+			// so we can write these to 'current' first without flushing to the stream,
+			// and then write the remaining bytes directly to the stream
+			i -= queue;
+			WriteInCurrent((INTERNAL_TYPE)value, queue);
+			value >>= queue;
+		}
+		WriteRemainder(value, i);
+	}
+
+private:
+	void WriteBit(bool value) {
+		auto& byte = GetCurrentByte();
+		if (value) { byte = byte | GetMask(); }
+		DecreaseFreeBits();
+	}
+
+	bool FitsInCurrent(uint8_t bits) { return free_bits >= bits; }
+	INTERNAL_TYPE GetMask() const { return (INTERNAL_TYPE)1 << free_bits; }
+
+	INTERNAL_TYPE& GetCurrentByte() { return current; }
+	//! Write a value of type INTERNAL_TYPE directly to the stream
+	void WriteToStream(INTERNAL_TYPE value) { stream[stream_index++] = value; }
+	void WriteToStream() {
+		stream[stream_index++] = current;
+		current                = 0;
+		free_bits              = INTERNAL_TYPE_BITSIZE;
+	}
+	void DecreaseFreeBits(uint8_t value = 1) {
+		D_ASSERT(free_bits >= value);
+		free_bits -= value;
+		if (free_bits == 0) { WriteToStream(); }
+	}
+	void WriteInCurrent(INTERNAL_TYPE value, uint8_t value_size) {
+		D_ASSERT(INTERNAL_TYPE_BITSIZE >= value_size);
+		const auto shift_amount = free_bits - value_size;
+		current |= (value & BitUtils<INTERNAL_TYPE>::Mask(value_size)) << shift_amount;
+		DecreaseFreeBits(value_size);
+	}
+
+	template <uint8_t VALUE_SIZE>
+	void WriteInCurrent(INTERNAL_TYPE value) {
+		D_ASSERT(INTERNAL_TYPE_BITSIZE >= VALUE_SIZE);
+		const auto shift_amount = free_bits - VALUE_SIZE;
+		current |= (value & BitUtils<INTERNAL_TYPE>::Mask(VALUE_SIZE)) << shift_amount;
+		DecreaseFreeBits(VALUE_SIZE);
+	}
+
+private:
+	uint8_t* stream; //! The stream we're writing our output to
+
+	INTERNAL_TYPE current; //! The current value we're writing into (zero-initialized)
+	uint8_t free_bits;     //! How many bits are still unwritten in 'current'
+	idx_t stream_index;    //! Index used to keep track of which index we're at in the stream
+
+	idx_t bits_written; //! The total amount of bits written to this stream
+};
+
+} // namespace alp_bench
diff --git a/benchmarks/include/chimp/packed_data.hpp b/benchmarks/include/chimp/packed_data.hpp
new file mode 100644
index 0000000..90a7d2d
--- /dev/null
+++ b/benchmarks/include/chimp/packed_data.hpp
@@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+// DuckDB
+//
+// lwcbench/compression/chimp/packed_data.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "chimp_utils.hpp"
+#include "duckdb/duckdb.h"
+
+namespace alp_bench {
+
+struct UnpackedData {
+	uint8_t leading_zero;
+	uint8_t significant_bits;
+	uint8_t index;
+};
+
+template <class CHIMP_TYPE>
+struct PackedDataUtils {
+private:
+	static constexpr uint8_t INDEX_BITS_SIZE   = 7;
+	static constexpr uint8_t LEADING_BITS_SIZE = 3;
+
+	static constexpr uint8_t INDEX_MASK   = ((uint8_t)1 << INDEX_BITS_SIZE) - 1;
+	static constexpr uint8_t LEADING_MASK = ((uint8_t)1 << LEADING_BITS_SIZE) - 1;
+
+	static constexpr uint8_t INDEX_SHIFT_AMOUNT   = (sizeof(uint16_t) * 8) - INDEX_BITS_SIZE;
+	static constexpr uint8_t LEADING_SHIFT_AMOUNT = INDEX_SHIFT_AMOUNT - LEADING_BITS_SIZE;
+
+public:
+	//|----------------| //! packed_data      (16 bits)
+	// IIIIIII           //! Index            (7 bits, shifted by 9)
+	//        LLL        //! LeadingZeros     (3 bits, shifted by 6)
+	//           SSSSSS  //! SignificantBits  (6 bits)
+	static inline void Unpack(uint16_t packed_data, UnpackedData& dest) {
+		dest.index            = packed_data >> INDEX_SHIFT_AMOUNT & INDEX_MASK;
+		dest.leading_zero     = packed_data >> LEADING_SHIFT_AMOUNT & LEADING_MASK;
+		dest.significant_bits = packed_data & SignificantBits<CHIMP_TYPE>::mask;
+		// Verify that combined, this is not bigger than the full size of the type
+		D_ASSERT(dest.significant_bits + dest.leading_zero <= (sizeof(CHIMP_TYPE) * 8));
+	}
+
+	static inline uint16_t Pack(uint8_t index, uint8_t leading_zero, uint8_t significant_bits) {
+		static constexpr uint8_t BIT_SIZE = (sizeof(CHIMP_TYPE) * 8);
+
+		uint16_t result = 0;
+		result += ((uint32_t)BIT_SIZE << 3) * (ChimpConstants::BUFFER_SIZE + index);
+		result += BIT_SIZE * (leading_zero & 7);
+		if (BIT_SIZE == 32) {
+			// Shift the result by 1 to occupy the 16th bit
+			result <<= 1;
+		}
+		result += (significant_bits & 63);
+
+		return result;
+	}
+};
+
+template <bool EMPTY>
+struct PackedDataBuffer {
+public:
+	PackedDataBuffer()
+	    : index(0)
+	    , buffer(nullptr) {}
+
+public:
+	void SetBuffer(uint16_t* buffer) {
+		this->buffer = buffer;
+		this->index  = 0;
+	}
+
+	void Reset() { this->index = 0; }
+
+	inline void Insert(uint16_t packed_data) {
+		if (!EMPTY) { buffer[index] = packed_data; }
+		index++;
+	}
+
+	idx_t index;
+	uint16_t* buffer;
+};
+
+} // namespace alp_bench
diff --git a/benchmarks/include/chimp/ring_buffer.hpp b/benchmarks/include/chimp/ring_buffer.hpp
new file mode 100644
index 0000000..ea15917
--- /dev/null
+++ b/benchmarks/include/chimp/ring_buffer.hpp
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+// DuckDB
+//
+// lwcbench/compression/chimp/ring_buffer.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "chimp_utils.hpp"
+
+namespace alp_bench {
+
+template <class CHIMP_TYPE>
+class RingBuffer {
+public:
+	static constexpr uint8_t RING_SIZE = ChimpConstants::BUFFER_SIZE;
+	static constexpr uint64_t LEAST_SIGNIFICANT_BIT_COUNT = SignificantBits<CHIMP_TYPE>::size + 7 + 1;
+	static constexpr uint64_t LEAST_SIGNIFICANT_BIT_MASK  = (1 << LEAST_SIGNIFICANT_BIT_COUNT) - 1;
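+	//! Besides the last RING_SIZE values, 'indices' maps the low
+	//! LEAST_SIGNIFICANT_BIT_COUNT bits of a value (its Key) to the position where
+	//! a value with those bits was last inserted, so a promising earlier reference
+	//! can be found in O(1). For 64-bit values this is 6 + 7 + 1 = 14 bits, hence
+	//! the 16384 (1 << 14) index entries noted below.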
+	static constexpr uint16_t INDICES_SIZE = 1 << LEAST_SIGNIFICANT_BIT_COUNT; // 16384
+
+public:
+	void Reset() { index = 0; }
+
+	RingBuffer()
+	    : index(0) {}
+	template <bool FIRST>
+	void Insert(uint64_t value) {
+		if (!FIRST) { index++; }
+		buffer[index % RING_SIZE] = value;
+		indices[Key(value)]       = index;
+	}
+	template <bool FIRST>
+	void InsertScan(uint64_t value) {
+		if (!FIRST) { index++; }
+		buffer[index % RING_SIZE] = value;
+	}
+	inline const uint64_t& Top() const { return buffer[index % RING_SIZE]; }
+	//! Get the index where values that produce this 'key' are stored
+	inline const uint64_t& IndexOf(const uint64_t& key) const { return indices[key]; }
+	//! Get the value at position 'index' of the buffer
+	inline const uint64_t& Value(const uint8_t& index_p) const { return buffer[index_p]; }
+	//! Get the amount of values that are inserted
+	inline const uint64_t& Size() const { return index; }
+	inline uint64_t Key(const uint64_t& value) const { return value & LEAST_SIGNIFICANT_BIT_MASK; }
+
+private:
+	uint64_t buffer[RING_SIZE] = {};     //! Stores the corresponding values
+	uint64_t index = 0;                  //! Keeps track of the index of the current value
+	uint64_t indices[INDICES_SIZE] = {}; //! Stores the corresponding indices
+};
+
+} // namespace alp_bench
diff --git a/benchmarks/include/duckdb/assert.hpp b/benchmarks/include/duckdb/assert.hpp
new file mode 100644
index 0000000..c598a04
--- /dev/null
+++ b/benchmarks/include/duckdb/assert.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "winapi.hpp"
+
+#if (defined(DUCKDB_USE_STANDARD_ASSERT) || !defined(DEBUG)) && !defined(DUCKDB_FORCE_ASSERT)
+
+#include <assert.h>
+#define D_ASSERT assert
+#else
+namespace alp_bench {
+DUCKDB_API void DuckDBAssertInternal(bool condition, const char* condition_name, const char* file, int linenr);
+}
+
+#define D_ASSERT(condition) alp_bench::DuckDBAssertInternal(bool(condition), #condition, __FILE__, __LINE__)
+
+#endif
diff --git a/benchmarks/include/duckdb/common.hpp b/benchmarks/include/duckdb/common.hpp
new file mode 100644
index 0000000..5b6f0a7
--- /dev/null
+++ b/benchmarks/include/duckdb/common.hpp
@@ -0,0 +1,4 @@
+#pragma once
+
+#include "constants.hpp"
+#include "helper.hpp"
diff --git a/benchmarks/include/duckdb/constants.hpp b/benchmarks/include/duckdb/constants.hpp
new file mode 100644
index 0000000..8f0caab
--- /dev/null
+++ b/benchmarks/include/duckdb/constants.hpp
@@ -0,0 +1,132 @@
+#pragma once
+
+#include "string.hpp"
+#include "winapi.hpp"
+#include <cstdint>
+#include <memory>
+
+namespace alp_bench {
+
+// API versions
+// if no explicit API version is defined, the latest API version is used
+// Note that using older API versions (i.e. not using DUCKDB_API_LATEST) is deprecated.
+// These will not be supported long-term, and will be removed in future versions.
+
+#ifndef DUCKDB_API_0_3_1
+#define DUCKDB_API_0_3_1 1
+#endif
+#ifndef DUCKDB_API_0_3_2
+#define DUCKDB_API_0_3_2 2
+#endif
+#ifndef DUCKDB_API_LATEST
+#define DUCKDB_API_LATEST DUCKDB_API_0_3_2
+#endif
+
+#ifndef DUCKDB_API_VERSION
+#define DUCKDB_API_VERSION DUCKDB_API_LATEST
+#endif
+
+//! inline std directives that we use frequently
+#ifndef DUCKDB_DEBUG_MOVE
+using std::move;
+#endif
+using std::shared_ptr;
+using std::unique_ptr;
+using std::weak_ptr;
+using data_ptr = unique_ptr<char[]>;
+using std::make_shared;
+
+// NOTE: there is a copy of this in the Postgres' parser grammar (gram.y)
+#define DEFAULT_SCHEMA "main"
+#define INVALID_SCHEMA ""
+#define INVALID_CATALOG ""
+#define SYSTEM_CATALOG "system"
+#define TEMP_CATALOG "temp"
+
+DUCKDB_API bool IsInvalidSchema(const string& str);
+DUCKDB_API bool IsInvalidCatalog(const string& str);
+
+//! a saner size_t for loop indices etc
+typedef uint64_t idx_t;
+
+//! The type used for row identifiers
+typedef int64_t row_t;
+
+//! The type used for hashes
+typedef uint64_t hash_t;
+
+//! data pointers
+typedef uint8_t data_t;
+typedef data_t* data_ptr_t;
+typedef const data_t* const_data_ptr_t;
+
+//! Type used for the selection vector
+typedef uint32_t sel_t;
+//! Type used for transaction timestamps
+typedef idx_t transaction_t;
+
+//! Type used for column identifiers
+typedef idx_t column_t;
+//! Type used for storage (column) identifiers
+typedef idx_t storage_t;
+//! Special value used to signify the ROW ID of a table
+DUCKDB_API extern const column_t COLUMN_IDENTIFIER_ROW_ID;
+DUCKDB_API bool IsRowIdColumnId(column_t column_id);
+
+//! The maximum row identifier used in tables
+extern const row_t MAX_ROW_ID;
+
+extern const transaction_t TRANSACTION_ID_START;
+extern const transaction_t MAX_TRANSACTION_ID;
+extern const transaction_t MAXIMUM_QUERY_ID;
+extern const transaction_t NOT_DELETED_ID;
+
+extern const double PI;
+
+struct DConstants {
+	//! The value used to signify an invalid index entry
+	static constexpr const idx_t INVALID_INDEX = idx_t(-1);
+};
+
+struct Storage {
+	//! The size of a hard disk sector, only really needed for Direct IO
+	constexpr static int SECTOR_SIZE = 4096;
+	//! Block header size for blocks written to the storage
+	constexpr static int BLOCK_HEADER_SIZE = sizeof(uint64_t);
+	// Size of a memory slot managed by the StorageManager. This is the quantum of allocation for Blocks on DuckDB. We
+	// default to 256KB. (1 << 18)
+	constexpr static int BLOCK_ALLOC_SIZE = 262144;
+	//! The actual memory space that is available within the blocks
+	constexpr static int BLOCK_SIZE = BLOCK_ALLOC_SIZE - BLOCK_HEADER_SIZE;
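+	// Worked out with the defaults above: BLOCK_ALLOC_SIZE = 1 << 18 = 262144 bytes
+	// (256KB) and BLOCK_HEADER_SIZE = 8, so each block offers 262136 usable bytes.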
+	//! The size of the headers. This should be small and written more or less atomically by the hard disk. We default
+	//! to the page size, which is 4KB. (1 << 12)
+	constexpr static int FILE_HEADER_SIZE = 4096;
+};
+
+struct LogicalIndex {
+	explicit LogicalIndex(idx_t index)
+	    : index(index) {}
+
+	idx_t index;
+
+	inline bool operator==(const LogicalIndex& rhs) const { return index == rhs.index; };
+	inline bool operator!=(const LogicalIndex& rhs) const { return index != rhs.index; };
+	inline bool operator<(const LogicalIndex& rhs) const { return index < rhs.index; };
+	bool IsValid() { return index != DConstants::INVALID_INDEX; }
+};
+
+struct PhysicalIndex {
+	explicit PhysicalIndex(idx_t index)
+	    : index(index) {}
+
+	idx_t index;
+
+	inline bool operator==(const PhysicalIndex& rhs) const { return index == rhs.index; };
+	inline bool operator!=(const PhysicalIndex& rhs) const { return index != rhs.index; };
+	inline bool operator<(const PhysicalIndex& rhs) const { return index < rhs.index; };
+	bool IsValid() { return index != DConstants::INVALID_INDEX; }
+};
+
+DUCKDB_API uint64_t NextPowerOfTwo(uint64_t v);
+
+} // namespace alp_bench
diff --git a/benchmarks/include/duckdb/duckdb.h b/benchmarks/include/duckdb/duckdb.h
new file mode 100644
index 0000000..7225dc4
--- /dev/null
+++ b/benchmarks/include/duckdb/duckdb.h
@@ -0,0 +1,2304 @@
+#pragma once
+
+// duplicate of duckdb/main/winapi.hpp
+#ifndef DUCKDB_API
+#ifdef _WIN32
+#if defined(DUCKDB_BUILD_LIBRARY) && !defined(DUCKDB_BUILD_LOADABLE_EXTENSION)
+#define DUCKDB_API __declspec(dllexport)
+#else
+#define DUCKDB_API __declspec(dllimport)
+#endif
+#else
+#define DUCKDB_API
+#endif
+#endif
+
+// duplicate of duckdb/main/winapi.hpp
+#ifndef DUCKDB_EXTENSION_API
+#ifdef _WIN32
+#ifdef DUCKDB_BUILD_LOADABLE_EXTENSION
+#define DUCKDB_EXTENSION_API __declspec(dllexport)
+#else
+#define DUCKDB_EXTENSION_API
+#endif
+#else
+#define DUCKDB_EXTENSION_API __attribute__((visibility("default")))
+#endif
+#endif
+
+// duplicate of lwcbench/dependencies/constants.hpp
+#ifndef DUCKDB_API_0_3_1
+#define DUCKDB_API_0_3_1 1
+#endif
+#ifndef DUCKDB_API_0_3_2
+#define DUCKDB_API_0_3_2 2
+#endif
+#ifndef DUCKDB_API_LATEST
+#define DUCKDB_API_LATEST DUCKDB_API_0_3_2
+#endif
+
+#ifndef DUCKDB_API_VERSION
+#define DUCKDB_API_VERSION DUCKDB_API_LATEST
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+//===--------------------------------------------------------------------===//
+// Type Information
+//===--------------------------------------------------------------------===//
+typedef uint64_t idx_t;
+
+typedef enum DUCKDB_TYPE {
+	DUCKDB_TYPE_INVALID = 0,
+	// bool
+	DUCKDB_TYPE_BOOLEAN,
+	// int8_t
+	DUCKDB_TYPE_TINYINT,
+	// int16_t
+	DUCKDB_TYPE_SMALLINT,
+	// int32_t
+	DUCKDB_TYPE_INTEGER,
+	// int64_t
+	DUCKDB_TYPE_BIGINT,
+	// uint8_t
+	DUCKDB_TYPE_UTINYINT,
+	// uint16_t
+	DUCKDB_TYPE_USMALLINT,
+	// uint32_t
+	DUCKDB_TYPE_UINTEGER,
+	// uint64_t
+	DUCKDB_TYPE_UBIGINT,
+	// float
+	DUCKDB_TYPE_FLOAT,
+	// double
+	DUCKDB_TYPE_DOUBLE,
+	// duckdb_timestamp, in microseconds
+	DUCKDB_TYPE_TIMESTAMP,
+	// duckdb_date
+	DUCKDB_TYPE_DATE,
+	// duckdb_time
+	DUCKDB_TYPE_TIME,
+	// duckdb_interval
+	DUCKDB_TYPE_INTERVAL,
+	// duckdb_hugeint
+	DUCKDB_TYPE_HUGEINT,
+	// const char*
+	DUCKDB_TYPE_VARCHAR,
+	// duckdb_blob
+	DUCKDB_TYPE_BLOB,
+	// decimal
+	DUCKDB_TYPE_DECIMAL,
+	// duckdb_timestamp, in seconds
+	DUCKDB_TYPE_TIMESTAMP_S,
+	// duckdb_timestamp, in milliseconds
+	DUCKDB_TYPE_TIMESTAMP_MS,
+	// duckdb_timestamp, in nanoseconds
+	DUCKDB_TYPE_TIMESTAMP_NS,
+	// enum type, only useful as logical type
+	DUCKDB_TYPE_ENUM,
+	// list type, only useful as logical type
+	DUCKDB_TYPE_LIST,
+	// struct type, only useful as logical type
+	DUCKDB_TYPE_STRUCT,
+	// map type, only useful as logical type
+	DUCKDB_TYPE_MAP,
+	// duckdb_hugeint
+	DUCKDB_TYPE_UUID,
+	// union type, only useful as logical type
+	DUCKDB_TYPE_UNION,
+} duckdb_type;
+
+//! Days are stored as days since 1970-01-01
+//! Use the duckdb_from_date/duckdb_to_date function to extract individual information
+typedef struct {
+	int32_t days;
+} duckdb_date;
+
+typedef struct {
+	int32_t year;
+	int8_t month;
+	int8_t day;
+} duckdb_date_struct;
+
+//! Time is stored as microseconds since 00:00:00
+//! Use the duckdb_from_time/duckdb_to_time function to extract individual information
+typedef struct {
+	int64_t micros;
+} duckdb_time;
+
+typedef struct {
+	int8_t hour;
+	int8_t min;
+	int8_t sec;
+	int32_t micros;
+} duckdb_time_struct;
+
+//! Timestamps are stored as microseconds since 1970-01-01
+//! Use the duckdb_from_timestamp/duckdb_to_timestamp function to extract individual information
+typedef struct {
+	int64_t micros;
+} duckdb_timestamp;
+
+typedef struct {
+	duckdb_date_struct date;
+	duckdb_time_struct time;
+} duckdb_timestamp_struct;
+
+typedef struct {
+	int32_t months;
+	int32_t days;
+	int64_t micros;
+} duckdb_interval;
+
+//! Hugeints are composed of a (lower, upper) component
+//! The value of the hugeint is upper * 2^64 + lower
+//! For easy usage, the functions duckdb_hugeint_to_double/duckdb_double_to_hugeint are recommended
+typedef struct {
+	uint64_t lower;
+	int64_t upper;
+} duckdb_hugeint;
+
+typedef struct {
+	uint8_t width;
+	uint8_t scale;
+
+	duckdb_hugeint value;
+} duckdb_decimal;
+
+typedef struct {
+	char* data;
+	idx_t size;
+} duckdb_string;
+
+typedef struct {
+	void* data;
+	idx_t size;
+} duckdb_blob;
+
+typedef struct {
+#if DUCKDB_API_VERSION < DUCKDB_API_0_3_2
+	void* data;
+	bool* nullmask;
+	duckdb_type type;
+	char* name;
+#else
+	// deprecated, use duckdb_column_data
+	void* __deprecated_data;
+	// deprecated, use duckdb_nullmask_data
+	bool* __deprecated_nullmask;
+	// deprecated, use duckdb_column_type
+	duckdb_type __deprecated_type;
+	// deprecated, use duckdb_column_name
+	char* __deprecated_name;
+#endif
+	void* internal_data;
+} duckdb_column;
+
+typedef struct {
+#if DUCKDB_API_VERSION < DUCKDB_API_0_3_2
+	idx_t column_count;
+	idx_t row_count;
+	idx_t rows_changed;
+	duckdb_column* columns;
+	char* error_message;
+#else
+	// deprecated, use duckdb_column_count
+	idx_t __deprecated_column_count;
+	// deprecated, use duckdb_row_count
+	idx_t __deprecated_row_count;
+	// deprecated, use duckdb_rows_changed
+	idx_t __deprecated_rows_changed;
+	// deprecated, use duckdb_column_ family of functions
+	duckdb_column* __deprecated_columns;
+	// deprecated, use duckdb_result_error
+	char* __deprecated_error_message;
+#endif
+	void* internal_data;
+} duckdb_result;
+
+typedef struct _duckdb_database {
+	void* __db;
+}* duckdb_database;
+typedef struct _duckdb_connection {
+	void* __conn;
+}* duckdb_connection;
+typedef struct _duckdb_prepared_statement {
+	void* __prep;
+}* duckdb_prepared_statement;
+typedef struct _duckdb_extracted_statements {
+	void* __extrac;
+}* duckdb_extracted_statements;
+typedef struct _duckdb_pending_result {
+	void* __pend;
+}* duckdb_pending_result;
+typedef struct _duckdb_appender {
+	void* __appn;
+}* duckdb_appender;
+typedef struct _duckdb_arrow {
+	void* __arrw;
+}* duckdb_arrow;
+typedef struct _duckdb_config {
+	void* __cnfg;
+}* duckdb_config;
+typedef struct _duckdb_arrow_schema {
+	void* __arrs;
+}* duckdb_arrow_schema;
+typedef struct _duckdb_arrow_array {
+	void* __arra;
+}* duckdb_arrow_array;
+typedef struct _duckdb_logical_type {
+	void* __lglt;
+}* duckdb_logical_type;
+typedef struct _duckdb_data_chunk {
+	void* __dtck;
+}* duckdb_data_chunk;
+typedef struct _duckdb_vector {
+	void* __vctr;
+}* duckdb_vector;
+typedef struct _duckdb_value {
+	void* __val;
+}* duckdb_value;
+
+typedef enum { DuckDBSuccess = 0, DuckDBError = 1 } duckdb_state;
+typedef enum {
+	DUCKDB_PENDING_RESULT_READY = 0,
+	DUCKDB_PENDING_RESULT_NOT_READY = 1,
+	DUCKDB_PENDING_ERROR = 2
+} duckdb_pending_state;
+
+//===--------------------------------------------------------------------===//
+// Open/Connect
+//===--------------------------------------------------------------------===//
+
+/*!
+Creates a new database or opens an existing database file stored at the given path.
+If no path is given, a new in-memory database is created instead.
+
+* path: Path to the database file on disk, or `nullptr` or `:memory:` to open an in-memory database.
+* out_database: The result database object.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_open(const char* path, duckdb_database* out_database);
+
+/*!
+Extended version of duckdb_open. Creates a new database or opens an existing database file stored at the given path.
+
+* path: Path to the database file on disk, or `nullptr` or `:memory:` to open an in-memory database.
+* out_database: The result database object.
+* config: (Optional) configuration used to start up the database system.
+* out_error: If set and the function returns DuckDBError, this will contain the reason why the start-up failed.
+Note that the error must be freed using `duckdb_free`.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_open_ext(const char* path,
+                                        duckdb_database* out_database,
+                                        duckdb_config config,
+                                        char** out_error);
+
+/*!
+Closes the specified database and de-allocates all memory allocated for that database.
+This should be called after you are done with any database allocated through `duckdb_open`.
+Note that failing to call `duckdb_close` (in case of e.g. a program crash) will not cause data corruption.
+Still, it is recommended to always correctly close a database object after you are done with it.
+
+* database: The database object to shut down.
+*/
+DUCKDB_API void duckdb_close(duckdb_database* database);
+
+/*!
+Opens a connection to a database. Connections are required to query the database, and store transactional state
+associated with the connection.
+
+* database: The database file to connect to.
+* out_connection: The result connection object.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_connect(duckdb_database database, duckdb_connection* out_connection);
+
+/*!
+Closes the specified connection and de-allocates all memory allocated for that connection.
+
+* connection: The connection to close.
+*/
+DUCKDB_API void duckdb_disconnect(duckdb_connection* connection);
+
+/*!
+Returns the version of the linked DuckDB, with a version postfix for dev versions.
+
+Usually used for developing C extensions that must return this for a compatibility check.
+*/
+DUCKDB_API const char* duckdb_library_version();
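+
+/*
+A minimal open → connect → query → cleanup sequence looks as follows (illustrative
+sketch; error handling omitted for brevity):
+
+```c
+duckdb_database db;
+duckdb_connection con;
+duckdb_result res;
+duckdb_open(NULL, &db); // NULL path: in-memory database
+duckdb_connect(db, &con);
+duckdb_query(con, "SELECT 42;", &res);
+duckdb_destroy_result(&res);
+duckdb_disconnect(&con);
+duckdb_close(&db);
+```
+*/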
+
+//===--------------------------------------------------------------------===//
+// Configuration
+//===--------------------------------------------------------------------===//
+/*!
+Initializes an empty configuration object that can be used to provide start-up options for the DuckDB instance
+through `duckdb_open_ext`.
+
+This will always succeed unless there is a malloc failure.
+
+* out_config: The result configuration object.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_create_config(duckdb_config* out_config);
+
+/*!
+This returns the total amount of configuration options available for usage with `duckdb_get_config_flag`.
+
+This should not be called in a loop as it internally loops over all the options.
+
+* returns: The amount of config options available.
+*/
+DUCKDB_API size_t duckdb_config_count();
+
+/*!
+Obtains a human-readable name and description of a specific configuration option. This can be used to e.g.
+display configuration options. This will succeed unless `index` is out of range (i.e. `>= duckdb_config_count`).
+
+The result name or description MUST NOT be freed.
+
+* index: The index of the configuration option (between 0 and `duckdb_config_count`)
+* out_name: A name of the configuration flag.
+* out_description: A description of the configuration flag.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_get_config_flag(size_t index, const char** out_name, const char** out_description);
+
+/*!
+Sets the specified option for the specified configuration. The configuration option is indicated by name.
+To obtain a list of config options, see `duckdb_get_config_flag`.
+
+In the source code, configuration options are defined in `config.cpp`.
+
+This can fail if either the name is invalid, or if the value provided for the option is invalid.
+
+* duckdb_config: The configuration object to set the option on.
+* name: The name of the configuration flag to set.
+* option: The value to set the configuration flag to.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_set_config(duckdb_config config, const char* name, const char* option);
+
+/*!
+Destroys the specified configuration object and de-allocates all memory allocated for the object.
+
+* config: The configuration object to destroy.
+*/
+DUCKDB_API void duckdb_destroy_config(duckdb_config* config);
+
+//===--------------------------------------------------------------------===//
+// Query Execution
+//===--------------------------------------------------------------------===//
+/*!
+Executes a SQL query within a connection and stores the full (materialized) result in the out_result pointer.
+If the query fails to execute, DuckDBError is returned and the error message can be retrieved by calling
+`duckdb_result_error`.
+
+Note that after running `duckdb_query`, `duckdb_destroy_result` must be called on the result object even if the
+query fails, otherwise the error stored within the result will not be freed correctly.
+
+* connection: The connection to perform the query in.
+* query: The SQL query to run.
+* out_result: The query result.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_query(duckdb_connection connection, const char* query, duckdb_result* out_result);
+
+/*!
+Closes the result and de-allocates all memory allocated for that result.
+
+* result: The result to destroy.
+*/
+DUCKDB_API void duckdb_destroy_result(duckdb_result* result);
+
+/*!
+Returns the column name of the specified column.
+The result should not need to be freed; the column names will
+automatically be destroyed when the result is destroyed.
+
+Returns `NULL` if the column is out of range.
+
+* result: The result object to fetch the column name from.
+* col: The column index.
+* returns: The column name of the specified column.
+*/
+DUCKDB_API const char* duckdb_column_name(duckdb_result* result, idx_t col);
+
+/*!
+Returns the column type of the specified column.
+
+Returns `DUCKDB_TYPE_INVALID` if the column is out of range.
+
+* result: The result object to fetch the column type from.
+* col: The column index.
+* returns: The column type of the specified column.
+*/
+DUCKDB_API duckdb_type duckdb_column_type(duckdb_result* result, idx_t col);
+
+/*!
+Returns the logical column type of the specified column.
+
+The return type of this call should be destroyed with `duckdb_destroy_logical_type`.
+
+Returns `NULL` if the column is out of range.
+
+* result: The result object to fetch the column type from.
+* col: The column index.
+* returns: The logical column type of the specified column.
+*/
+DUCKDB_API duckdb_logical_type duckdb_column_logical_type(duckdb_result* result, idx_t col);
+
+/*!
+Returns the number of columns present in the result object.
+
+* result: The result object.
+* returns: The number of columns present in the result object.
+*/
+DUCKDB_API idx_t duckdb_column_count(duckdb_result* result);
+
+/*!
+Returns the number of rows present in the result object.
+
+* result: The result object.
+* returns: The number of rows present in the result object.
+*/
+DUCKDB_API idx_t duckdb_row_count(duckdb_result* result);
+
+/*!
+Returns the number of rows changed by the query stored in the result. This is relevant only for INSERT/UPDATE/DELETE
+queries. For other queries the rows_changed will be 0.
+
+* result: The result object.
+* returns: The number of rows changed.
+*/
+DUCKDB_API idx_t duckdb_rows_changed(duckdb_result* result);
+
+/*!
+**DEPRECATED**: Prefer using `duckdb_result_get_chunk` instead.
+
+Returns the data of a specific column of a result in columnar format.
+
+The function returns a dense array which contains the result data. The exact type stored in the array depends on the
+corresponding duckdb_type (as provided by `duckdb_column_type`). For the exact type by which the data should be
+accessed, see the comments in [the types section](types) or the `DUCKDB_TYPE` enum.
+
+For example, for a column of type `DUCKDB_TYPE_INTEGER`, rows can be accessed in the following manner:
+```c
+int32_t *data = (int32_t *) duckdb_column_data(&result, 0);
+printf("Data for row %d: %d\n", row, data[row]);
+```
+
+* result: The result object to fetch the column data from.
+* col: The column index.
+* returns: The column data of the specified column.
+*/
+DUCKDB_API void* duckdb_column_data(duckdb_result* result, idx_t col);
+
+/*!
+**DEPRECATED**: Prefer using `duckdb_result_get_chunk` instead.
+
+Returns the nullmask of a specific column of a result in columnar format. The nullmask indicates for every row
+whether or not the corresponding row is `NULL`. If a row is `NULL`, the values present in the array provided
+by `duckdb_column_data` are undefined.
+
+```c
+int32_t *data = (int32_t *) duckdb_column_data(&result, 0);
+bool *nullmask = duckdb_nullmask_data(&result, 0);
+if (nullmask[row]) {
+	printf("Data for row %d: NULL\n", row);
+} else {
+	printf("Data for row %d: %d\n", row, data[row]);
+}
+```
+
+* result: The result object to fetch the nullmask from.
+* col: The column index.
+* returns: The nullmask of the specified column.
+*/
+DUCKDB_API bool* duckdb_nullmask_data(duckdb_result* result, idx_t col);
+
+/*!
+Returns the error message contained within the result. The error is only set if `duckdb_query` returns `DuckDBError`.
+
+The result of this function must not be freed. It will be cleaned up when `duckdb_destroy_result` is called.
+
+* result: The result object to fetch the error from.
+* returns: The error of the result.
+*/
+DUCKDB_API const char* duckdb_result_error(duckdb_result* result);
+
+//===--------------------------------------------------------------------===//
+// Result Functions
+//===--------------------------------------------------------------------===//
+
+/*!
+Fetches a data chunk from the duckdb_result. This function should be called repeatedly until the result is exhausted.
+
+The result must be destroyed with `duckdb_destroy_data_chunk`.
+
+This function supersedes all `duckdb_value` functions, as well as the `duckdb_column_data` and `duckdb_nullmask_data`
+functions. It results in significantly better performance, and should be preferred in newer code-bases.
+
+If this function is used, none of the other result functions can be used and vice versa (i.e. this function cannot be
+mixed with the legacy result functions).
+
+Use `duckdb_result_chunk_count` to figure out how many chunks there are in the result.
+
+* result: The result object to fetch the data chunk from.
+* chunk_index: The chunk index to fetch from.
+* returns: The resulting data chunk. Returns `NULL` if the chunk index is out of bounds.
+*/
+DUCKDB_API duckdb_data_chunk duckdb_result_get_chunk(duckdb_result result, idx_t chunk_index);
+
+/*!
+Returns the number of data chunks present in the result.
+
+* result: The result object
+* returns: The number of data chunks present in the result.
+*/
+DUCKDB_API idx_t duckdb_result_chunk_count(duckdb_result result);
+
+// Safe fetch functions
+// These functions will perform conversions if necessary.
+// On failure (e.g. if conversion cannot be performed or if the value is NULL) a default value is returned.
+// Note that these functions are slow since they perform bounds checking and conversion
+// For fast access of values prefer using `duckdb_result_get_chunk`
+
+/*!
+ * returns: The boolean value at the specified location, or false if the value cannot be converted.
+ */
+DUCKDB_API bool duckdb_value_boolean(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The int8_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API int8_t duckdb_value_int8(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The int16_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API int16_t duckdb_value_int16(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The int32_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API int32_t duckdb_value_int32(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The int64_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API int64_t duckdb_value_int64(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The duckdb_hugeint value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API duckdb_hugeint duckdb_value_hugeint(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The duckdb_decimal value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API duckdb_decimal duckdb_value_decimal(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The uint8_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API uint8_t duckdb_value_uint8(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The uint16_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API uint16_t duckdb_value_uint16(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The uint32_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API uint32_t duckdb_value_uint32(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The uint64_t value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API uint64_t duckdb_value_uint64(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The float value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API float duckdb_value_float(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The double value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API double duckdb_value_double(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The duckdb_date value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API duckdb_date duckdb_value_date(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The duckdb_time value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API duckdb_time duckdb_value_time(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The duckdb_timestamp value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API duckdb_timestamp duckdb_value_timestamp(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: The duckdb_interval value at the specified location, or 0 if the value cannot be converted.
+ */
+DUCKDB_API duckdb_interval duckdb_value_interval(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+* DEPRECATED: use duckdb_value_string instead. This function does not work correctly if the string contains null bytes.
+* returns: The text value at the specified location as a null-terminated string, or nullptr if the value cannot be
+converted. The result must be freed with `duckdb_free`.
+*/
+DUCKDB_API char* duckdb_value_varchar(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+* returns: The string value at the specified location.
+The result must be freed with `duckdb_free`.
+*/
+DUCKDB_API duckdb_string duckdb_value_string(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+* DEPRECATED: use duckdb_value_string_internal instead. This function does not work correctly if the string contains
+null bytes.
+* returns: The char* value at the specified location. ONLY works on VARCHAR columns and does not auto-cast.
+If the column is NOT a VARCHAR column, this function will return NULL.
+
+The result must NOT be freed.
+*/
+DUCKDB_API char* duckdb_value_varchar_internal(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+* DEPRECATED: use duckdb_value_string_internal instead. This function does not work correctly if the string contains
+null bytes.
+* returns: The char* value at the specified location. ONLY works on VARCHAR columns and does not auto-cast.
+If the column is NOT a VARCHAR column, this function will return NULL.
+
+The result must NOT be freed.
+*/
+DUCKDB_API duckdb_string duckdb_value_string_internal(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+* returns: The duckdb_blob value at the specified location. Returns a blob with blob.data set to nullptr if the
+value cannot be converted. The resulting "blob.data" must be freed with `duckdb_free`.
+*/
+DUCKDB_API duckdb_blob duckdb_value_blob(duckdb_result* result, idx_t col, idx_t row);
+
+/*!
+ * returns: Returns true if the value at the specified index is NULL, and false otherwise.
+ */
+DUCKDB_API bool duckdb_value_is_null(duckdb_result* result, idx_t col, idx_t row);
+
+//===--------------------------------------------------------------------===//
+// Helpers
+//===--------------------------------------------------------------------===//
+/*!
+Allocate `size` bytes of memory using the duckdb internal malloc function. Any memory allocated in this manner
+should be freed using `duckdb_free`.
+
+* size: The number of bytes to allocate.
+* returns: A pointer to the allocated memory region.
+*/
+DUCKDB_API void* duckdb_malloc(size_t size);
+
+/*!
+Free a value returned from `duckdb_malloc`, `duckdb_value_varchar` or `duckdb_value_blob`.
+
+* ptr: The memory region to de-allocate.
+*/
+DUCKDB_API void duckdb_free(void* ptr);
+
+/*!
+The internal vector size used by DuckDB.
+This is the amount of tuples that will fit into a data chunk created by `duckdb_create_data_chunk`.
+
+* returns: The vector size.
+*/
+DUCKDB_API idx_t duckdb_vector_size();
+
+//===--------------------------------------------------------------------===//
+// Date/Time/Timestamp Helpers
+//===--------------------------------------------------------------------===//
+/*!
+Decompose a `duckdb_date` object into year, month and date (stored as `duckdb_date_struct`).
+
+* date: The date object, as obtained from a `DUCKDB_TYPE_DATE` column.
+* returns: The `duckdb_date_struct` with the decomposed elements.
+*/
+DUCKDB_API duckdb_date_struct duckdb_from_date(duckdb_date date);
+
+/*!
+Re-compose a `duckdb_date` from year, month and date (`duckdb_date_struct`).
+
+* date: The year, month and date stored in a `duckdb_date_struct`.
+* returns: The `duckdb_date` element.
+*/
+DUCKDB_API duckdb_date duckdb_to_date(duckdb_date_struct date);
+
+/*!
+Decompose a `duckdb_time` object into hour, minute, second and microsecond (stored as `duckdb_time_struct`).
+
+* time: The time object, as obtained from a `DUCKDB_TYPE_TIME` column.
+* returns: The `duckdb_time_struct` with the decomposed elements.
+*/
+DUCKDB_API duckdb_time_struct duckdb_from_time(duckdb_time time);
+
+/*!
+Re-compose a `duckdb_time` from hour, minute, second and microsecond (`duckdb_time_struct`).
+
+* time: The hour, minute, second and microsecond in a `duckdb_time_struct`.
+* returns: The `duckdb_time` element.
+*/
+DUCKDB_API duckdb_time duckdb_to_time(duckdb_time_struct time);
+
+/*!
+Decompose a `duckdb_timestamp` object into a `duckdb_timestamp_struct`.
+
+* ts: The ts object, as obtained from a `DUCKDB_TYPE_TIMESTAMP` column.
+* returns: The `duckdb_timestamp_struct` with the decomposed elements.
+*/
+DUCKDB_API duckdb_timestamp_struct duckdb_from_timestamp(duckdb_timestamp ts);
+
+/*!
+Re-compose a `duckdb_timestamp` from a duckdb_timestamp_struct.
+
+* ts: The de-composed elements in a `duckdb_timestamp_struct`.
+* returns: The `duckdb_timestamp` element.
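+
+For example, composing a timestamp and decomposing it again (illustrative sketch):
+
+```c
+duckdb_timestamp_struct parts;
+parts.date = (duckdb_date_struct) {.year = 1992, .month = 9, .day = 20};
+parts.time = (duckdb_time_struct) {.hour = 12, .min = 30, .sec = 0, .micros = 0};
+duckdb_timestamp ts = duckdb_to_timestamp(parts);
+duckdb_timestamp_struct back = duckdb_from_timestamp(ts); /* back.date.year == 1992 */
+```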
+*/
+DUCKDB_API duckdb_timestamp duckdb_to_timestamp(duckdb_timestamp_struct ts);
+
+//===--------------------------------------------------------------------===//
+// Hugeint Helpers
+//===--------------------------------------------------------------------===//
+/*!
+Converts a duckdb_hugeint object (as obtained from a `DUCKDB_TYPE_HUGEINT` column) into a double.
+
+* val: The hugeint value.
+* returns: The converted `double` element.
+*/
+DUCKDB_API double duckdb_hugeint_to_double(duckdb_hugeint val);
+
+/*!
+Converts a double value to a duckdb_hugeint object.
+
+If the conversion fails because the double value is too big, the result will be 0.
+
+* val: The double value.
+* returns: The converted `duckdb_hugeint` element.
+*/
+DUCKDB_API duckdb_hugeint duckdb_double_to_hugeint(double val);
+
+/*!
+Converts a double value to a duckdb_decimal object.
+
+If the conversion fails because the double value is too big, or the width/scale are invalid, the result will be 0.
+
+* val: The double value.
+* returns: The converted `duckdb_decimal` element.
+*/
+DUCKDB_API duckdb_decimal duckdb_double_to_decimal(double val, uint8_t width, uint8_t scale);
+
+//===--------------------------------------------------------------------===//
+// Decimal Helpers
+//===--------------------------------------------------------------------===//
+/*!
+Converts a duckdb_decimal object (as obtained from a `DUCKDB_TYPE_DECIMAL` column) into a double.
+
+* val: The decimal value.
+* returns: The converted `double` element.
+*/
+DUCKDB_API double duckdb_decimal_to_double(duckdb_decimal val);
+
+//===--------------------------------------------------------------------===//
+// Prepared Statements
+//===--------------------------------------------------------------------===//
+// A prepared statement is a parameterized query that allows you to bind parameters to it.
+// * This is useful to easily supply parameters to functions and avoid SQL injection attacks.
+// * This is useful to speed up queries that you will execute several times with different parameters,
+//   because the query will only be parsed, bound, optimized and planned once during the prepare stage,
+//   rather than once per execution.
+// For example:
+//   SELECT * FROM tbl WHERE id=?
+// Or a query with multiple parameters:
+//   SELECT * FROM tbl WHERE id=$1 OR name=$2
+
+/*!
+Create a prepared statement object from a query.
+
+Note that after calling `duckdb_prepare`, the prepared statement should always be destroyed using
+`duckdb_destroy_prepare`, even if the prepare fails.
+
+If the prepare fails, `duckdb_prepare_error` can be called to obtain the reason why the prepare failed.
+
+* connection: The connection object
+* query: The SQL query to prepare
+* out_prepared_statement: The resulting prepared statement object
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_prepare(duckdb_connection connection,
+                                       const char* query,
+                                       duckdb_prepared_statement* out_prepared_statement);
+
+/*!
+Closes the prepared statement and de-allocates all memory allocated for the statement.
+
+* prepared_statement: The prepared statement to destroy.
+*/
+DUCKDB_API void duckdb_destroy_prepare(duckdb_prepared_statement* prepared_statement);
+
+/*!
+Returns the error message associated with the given prepared statement.
+If the prepared statement has no error message, this returns `nullptr` instead.
+
+The error message should not be freed. It will be de-allocated when `duckdb_destroy_prepare` is called.
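+
+A typical prepare → bind → execute → cleanup flow, in which `duckdb_prepare_error`
+is consulted on failure, looks as follows (illustrative sketch; assumes an open
+connection `con` and an existing table `tbl`):
+
+```c
+duckdb_prepared_statement prep;
+if (duckdb_prepare(con, "SELECT * FROM tbl WHERE id=?", &prep) == DuckDBError) {
+	printf("prepare failed: %s\n", duckdb_prepare_error(prep));
+} else {
+	duckdb_bind_int32(prep, 1, 42); // parameter indices are 1-based
+	duckdb_result res;
+	duckdb_execute_prepared(prep, &res);
+	duckdb_destroy_result(&res);
+}
+duckdb_destroy_prepare(&prep);
+```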
+
+* prepared_statement: The prepared statement to obtain the error from.
+* returns: The error message, or `nullptr` if there is none.
+*/
+DUCKDB_API const char* duckdb_prepare_error(duckdb_prepared_statement prepared_statement);
+
+/*!
+Returns the number of parameters that can be provided to the given prepared statement.
+
+Returns 0 if the query was not successfully prepared.
+
+* prepared_statement: The prepared statement to obtain the number of parameters for.
+*/
+DUCKDB_API idx_t duckdb_nparams(duckdb_prepared_statement prepared_statement);
+
+/*!
+Returns the parameter type for the parameter at the given index.
+
+Returns `DUCKDB_TYPE_INVALID` if the parameter index is out of range or the statement was not successfully prepared.
+
+* prepared_statement: The prepared statement.
+* param_idx: The parameter index.
+* returns: The parameter type
+*/
+DUCKDB_API duckdb_type duckdb_param_type(duckdb_prepared_statement prepared_statement, idx_t param_idx);
+
+/*!
+Clears the parameters bound to the prepared statement.
+*/
+DUCKDB_API duckdb_state duckdb_clear_bindings(duckdb_prepared_statement prepared_statement);
+
+/*!
+Binds a bool value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_boolean(duckdb_prepared_statement prepared_statement, idx_t param_idx, bool val);
+
+/*!
+Binds an int8_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_int8(duckdb_prepared_statement prepared_statement, idx_t param_idx, int8_t val);
+
+/*!
+Binds an int16_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_int16(duckdb_prepared_statement prepared_statement, idx_t param_idx, int16_t val);
+
+/*!
+Binds an int32_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_int32(duckdb_prepared_statement prepared_statement, idx_t param_idx, int32_t val);
+
+/*!
+Binds an int64_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_int64(duckdb_prepared_statement prepared_statement, idx_t param_idx, int64_t val);
+
+/*!
+Binds a duckdb_hugeint value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_hugeint(duckdb_prepared_statement prepared_statement,
+                                            idx_t param_idx,
+                                            duckdb_hugeint val);
+/*!
+Binds a duckdb_decimal value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_decimal(duckdb_prepared_statement prepared_statement,
+                                            idx_t param_idx,
+                                            duckdb_decimal val);
+
+/*!
+Binds a uint8_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_uint8(duckdb_prepared_statement prepared_statement, idx_t param_idx, uint8_t val);
+
+/*!
+Binds a uint16_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_uint16(duckdb_prepared_statement prepared_statement, idx_t param_idx, uint16_t val);
+
+/*!
+Binds a uint32_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_uint32(duckdb_prepared_statement prepared_statement, idx_t param_idx, uint32_t val);
+
+/*!
+Binds a uint64_t value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_uint64(duckdb_prepared_statement prepared_statement, idx_t param_idx, uint64_t val);
+
+/*!
+Binds a float value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_float(duckdb_prepared_statement prepared_statement, idx_t param_idx, float val);
+
+/*!
+Binds a double value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_double(duckdb_prepared_statement prepared_statement, idx_t param_idx, double val);
+
+/*!
+Binds a duckdb_date value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_date(duckdb_prepared_statement prepared_statement,
+                                         idx_t param_idx,
+                                         duckdb_date val);
+
+/*!
+Binds a duckdb_time value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_time(duckdb_prepared_statement prepared_statement,
+                                         idx_t param_idx,
+                                         duckdb_time val);
+
+/*!
+Binds a duckdb_timestamp value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_timestamp(duckdb_prepared_statement prepared_statement,
+                                              idx_t param_idx,
+                                              duckdb_timestamp val);
+
+/*!
+Binds a duckdb_interval value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_interval(duckdb_prepared_statement prepared_statement,
+                                             idx_t param_idx,
+                                             duckdb_interval val);
+
+/*!
+Binds a null-terminated varchar value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_varchar(duckdb_prepared_statement prepared_statement,
+                                            idx_t param_idx,
+                                            const char* val);
+
+/*!
+Binds a varchar value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_varchar_length(duckdb_prepared_statement prepared_statement,
+                                                   idx_t param_idx,
+                                                   const char* val,
+                                                   idx_t length);
+
+/*!
+Binds a blob value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_blob(duckdb_prepared_statement prepared_statement,
+                                         idx_t param_idx,
+                                         const void* data,
+                                         idx_t length);
+
+/*!
+Binds a NULL value to the prepared statement at the specified index.
+*/
+DUCKDB_API duckdb_state duckdb_bind_null(duckdb_prepared_statement prepared_statement, idx_t param_idx);
+
+/*!
+Executes the prepared statement with the given bound parameters, and returns a materialized query result.
+
+This method can be called multiple times for each prepared statement, and the parameters can be modified
+between calls to this function.
+
+* prepared_statement: The prepared statement to execute.
+* out_result: The query result.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_execute_prepared(duckdb_prepared_statement prepared_statement,
+                                                duckdb_result* out_result);
+
+/*!
+Executes the prepared statement with the given bound parameters, and returns an arrow query result.
+
+* prepared_statement: The prepared statement to execute.
+* out_result: The query result.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_execute_prepared_arrow(duckdb_prepared_statement prepared_statement,
+                                                      duckdb_arrow* out_result);
+
+//===--------------------------------------------------------------------===//
+// Extract Statements
+//===--------------------------------------------------------------------===//
+// A query string can be extracted into multiple SQL statements. Each statement can be prepared and executed separately.
+
+/*!
+Extract all statements from a query.
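+
+For example (illustrative sketch; assumes an open connection `con`, error paths omitted):
+
+```c
+duckdb_extracted_statements stmts;
+idx_t n = duckdb_extract_statements(con, "SELECT 1; SELECT 2;", &stmts);
+for (idx_t i = 0; i < n; i++) {
+	duckdb_prepared_statement prep;
+	duckdb_state state = duckdb_prepare_extracted_statement(con, stmts, i, &prep);
+	if (state == DuckDBSuccess) {
+		duckdb_result res;
+		duckdb_execute_prepared(prep, &res);
+		duckdb_destroy_result(&res);
+	}
+	duckdb_destroy_prepare(&prep); /* destroy even if the prepare failed */
+}
+duckdb_destroy_extracted(&stmts);
+```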
+Note that after calling `duckdb_extract_statements`, the extracted statements should always be destroyed using
+`duckdb_destroy_extracted`, even if no statements were extracted.
+If the extract fails, `duckdb_extract_statements_error` can be called to obtain the reason why the extract failed.
+* connection: The connection object
+* query: The SQL query to extract
+* out_extracted_statements: The resulting extracted statements object
+* returns: The number of extracted statements or 0 on failure.
+*/
+DUCKDB_API idx_t duckdb_extract_statements(duckdb_connection connection,
+                                           const char* query,
+                                           duckdb_extracted_statements* out_extracted_statements);
+
+/*!
+Prepare an extracted statement.
+Note that after calling `duckdb_prepare_extracted_statement`, the prepared statement should always be destroyed using
+`duckdb_destroy_prepare`, even if the prepare fails.
+If the prepare fails, `duckdb_prepare_error` can be called to obtain the reason why the prepare failed.
+* connection: The connection object
+* extracted_statements: The extracted statements object
+* index: The index of the extracted statement to prepare
+* out_prepared_statement: The resulting prepared statement object
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_prepare_extracted_statement(duckdb_connection connection,
+                                                           duckdb_extracted_statements extracted_statements,
+                                                           idx_t index,
+                                                           duckdb_prepared_statement* out_prepared_statement);
+/*!
+Returns the error message contained within the extracted statements.
+The result of this function must not be freed. It will be cleaned up when `duckdb_destroy_extracted` is called.
+* result: The extracted statements to fetch the error from.
+* returns: The error of the extracted statements.
+*/
+DUCKDB_API const char* duckdb_extract_statements_error(duckdb_extracted_statements extracted_statements);
+
+/*!
+De-allocates all memory allocated for the extracted statements.
+* extracted_statements: The extracted statements to destroy.
+*/
+DUCKDB_API void duckdb_destroy_extracted(duckdb_extracted_statements* extracted_statements);
+
+//===--------------------------------------------------------------------===//
+// Pending Result Interface
+//===--------------------------------------------------------------------===//
+/*!
+Executes the prepared statement with the given bound parameters, and returns a pending result.
+The pending result represents an intermediate structure for a query that is not yet fully executed.
+The pending result can be used to incrementally execute a query, returning control to the client between tasks.
+
+Note that after calling `duckdb_pending_prepared`, the pending result should always be destroyed using
+`duckdb_destroy_pending`, even if this function returns DuckDBError.
+
+* prepared_statement: The prepared statement to execute.
+* out_result: The pending query result.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_pending_prepared(duckdb_prepared_statement prepared_statement,
+                                                duckdb_pending_result* out_result);
+
+/*!
+Closes the pending result and de-allocates all memory allocated for the result.
+
+* pending_result: The pending result to destroy.
+*/
+DUCKDB_API void duckdb_destroy_pending(duckdb_pending_result* pending_result);
+
+/*!
+Returns the error message contained within the pending result.
+
+The result of this function must not be freed. It will be cleaned up when `duckdb_destroy_pending` is called.
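+
+A typical incremental-execution loop looks as follows (illustrative sketch; assumes
+`prep` is a successfully prepared statement):
+
+```c
+duckdb_pending_result pending;
+duckdb_pending_state exec_state;
+if (duckdb_pending_prepared(prep, &pending) == DuckDBError) {
+	printf("%s\n", duckdb_pending_error(pending));
+}
+while ((exec_state = duckdb_pending_execute_task(pending)) == DUCKDB_PENDING_RESULT_NOT_READY) {
+	/* control returns here between tasks; do other work if desired */
+}
+if (exec_state == DUCKDB_PENDING_RESULT_READY) {
+	duckdb_result res;
+	duckdb_execute_pending(pending, &res);
+	duckdb_destroy_result(&res);
+}
+duckdb_destroy_pending(&pending);
+```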
+
+* result: The pending result to fetch the error from.
+* returns: The error of the pending result.
+*/
+DUCKDB_API const char* duckdb_pending_error(duckdb_pending_result pending_result);
+
+/*!
+Executes a single task within the query, returning whether or not the query is ready.
+
+If this returns DUCKDB_PENDING_RESULT_READY, the duckdb_execute_pending function can be called to obtain the result.
+If this returns DUCKDB_PENDING_RESULT_NOT_READY, the duckdb_pending_execute_task function should be called again.
+If this returns DUCKDB_PENDING_ERROR, an error occurred during execution.
+
+The error message can be obtained by calling duckdb_pending_error on the pending_result.
+
+* pending_result: The pending result to execute a task within.
+* returns: The state of the pending result after the execution.
+*/
+DUCKDB_API duckdb_pending_state duckdb_pending_execute_task(duckdb_pending_result pending_result);
+
+/*!
+Fully execute a pending query result, returning the final query result.
+
+If duckdb_pending_execute_task has been called until DUCKDB_PENDING_RESULT_READY was returned, this will return fast.
+Otherwise, all remaining tasks must be executed first.
+
+* pending_result: The pending result to execute.
+* out_result: The result object.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_execute_pending(duckdb_pending_result pending_result, duckdb_result* out_result);
+
+//===--------------------------------------------------------------------===//
+// Value Interface
+//===--------------------------------------------------------------------===//
+/*!
+Destroys the value and de-allocates all memory allocated for that type.
+
+* value: The value to destroy.
+*/
+DUCKDB_API void duckdb_destroy_value(duckdb_value* value);
+
+/*!
+Creates a value from a null-terminated string
+
+* value: The null-terminated string
+* returns: The value. This must be destroyed with `duckdb_destroy_value`.
+*/
+DUCKDB_API duckdb_value duckdb_create_varchar(const char* text);
+
+/*!
+Creates a value from a string
+
+* value: The text
+* length: The length of the text
+* returns: The value. This must be destroyed with `duckdb_destroy_value`.
+*/
+DUCKDB_API duckdb_value duckdb_create_varchar_length(const char* text, idx_t length);
+
+/*!
+Creates a value from an int64
+
+* value: The bigint value
+* returns: The value. This must be destroyed with `duckdb_destroy_value`.
+*/
+DUCKDB_API duckdb_value duckdb_create_int64(int64_t val);
+
+/*!
+Obtains a string representation of the given value.
+The result must be destroyed with `duckdb_free`.
+
+* value: The value
+* returns: The string value. This must be destroyed with `duckdb_free`.
+*/
+DUCKDB_API char* duckdb_get_varchar(duckdb_value value);
+
+/*!
+Obtains an int64 of the given value.
+
+* value: The value
+* returns: The int64 value, or 0 if no conversion is possible
+*/
+DUCKDB_API int64_t duckdb_get_int64(duckdb_value value);
+
+//===--------------------------------------------------------------------===//
+// Logical Type Interface
+//===--------------------------------------------------------------------===//
+
+/*!
+Creates a `duckdb_logical_type` from a standard primitive type.
+The resulting type should be destroyed with `duckdb_destroy_logical_type`.
+
+This should not be used with `DUCKDB_TYPE_DECIMAL`.
+
+* type: The primitive type to create.
+* returns: The logical type.
+*/
+DUCKDB_API duckdb_logical_type duckdb_create_logical_type(duckdb_type type);
+
+/*!
+Creates a list type from its child type.
+The resulting type should be destroyed with `duckdb_destroy_logical_type`.
+
+* type: The child type of the list type to create.
+* returns: The logical type.
+*/
+DUCKDB_API duckdb_logical_type duckdb_create_list_type(duckdb_logical_type type);
+
+/*!
+Creates a map type from its key type and value type.
+The resulting type should be destroyed with `duckdb_destroy_logical_type`.
+
+* key_type: The key type of the map type to create.
+* value_type: The value type of the map type to create.
+* returns: The logical type.
+*/
+DUCKDB_API duckdb_logical_type duckdb_create_map_type(duckdb_logical_type key_type, duckdb_logical_type value_type);
+
+/*!
+Creates a UNION type from the passed member types and names.
+The resulting type should be destroyed with `duckdb_destroy_logical_type`.
+
+* member_types: The array of types that the union should consist of.
+* member_names: The names of the union members.
+* member_count: The number of union members.
+* returns: The logical type.
+*/
+DUCKDB_API duckdb_logical_type duckdb_create_union_type(duckdb_logical_type member_types,
+                                                        const char** member_names,
+                                                        idx_t member_count);
+
+/*!
+Creates a `duckdb_logical_type` of type decimal with the specified width and scale.
+The resulting type should be destroyed with `duckdb_destroy_logical_type`.
+
+* width: The width of the decimal type
+* scale: The scale of the decimal type
+* returns: The logical type.
+*/
+DUCKDB_API duckdb_logical_type duckdb_create_decimal_type(uint8_t width, uint8_t scale);
+
+/*!
+Retrieves the type class of a `duckdb_logical_type`.
+
+* type: The logical type object
+* returns: The type id
+*/
+DUCKDB_API duckdb_type duckdb_get_type_id(duckdb_logical_type type);
+
+/*!
+Retrieves the width of a decimal type.
+
+* type: The logical type object
+* returns: The width of the decimal type
+*/
+DUCKDB_API uint8_t duckdb_decimal_width(duckdb_logical_type type);
+
+/*!
+Retrieves the scale of a decimal type.
+
+* type: The logical type object
+* returns: The scale of the decimal type
+*/
+DUCKDB_API uint8_t duckdb_decimal_scale(duckdb_logical_type type);
+
+/*!
+Retrieves the internal storage type of a decimal type.
+
+* type: The logical type object
+* returns: The internal type of the decimal type
+*/
+DUCKDB_API duckdb_type duckdb_decimal_internal_type(duckdb_logical_type type);
+
+/*!
+Retrieves the internal storage type of an enum type.
+
+* type: The logical type object
+* returns: The internal type of the enum type
+*/
+DUCKDB_API duckdb_type duckdb_enum_internal_type(duckdb_logical_type type);
+
+/*!
+Retrieves the dictionary size of the enum type.
+
+* type: The logical type object
+* returns: The dictionary size of the enum type
+*/
+DUCKDB_API uint32_t duckdb_enum_dictionary_size(duckdb_logical_type type);
+
+/*!
+Retrieves the dictionary value at the specified position from the enum.
+
+The result must be freed with `duckdb_free`.
+
+* type: The logical type object
+* index: The index in the dictionary
+* returns: The string value of the enum type. Must be freed with `duckdb_free`.
+*/
+DUCKDB_API char* duckdb_enum_dictionary_value(duckdb_logical_type type, idx_t index);
+
+/*!
+Retrieves the child type of the given list type.
+
+The result must be freed with `duckdb_destroy_logical_type`.
+
+* type: The logical type object
+* returns: The child type of the list type. Must be destroyed with `duckdb_destroy_logical_type`.
+*/
+DUCKDB_API duckdb_logical_type duckdb_list_type_child_type(duckdb_logical_type type);
+
+/*!
+Retrieves the key type of the given map type.
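+
+For instance (illustrative sketch; `map_type` is assumed to be a MAP logical type):
+
+duckdb_logical_type key_type = duckdb_map_type_key_type(map_type);
+// ... inspect key_type ...
+duckdb_destroy_logical_type(&key_type);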
+ +The result must be freed with `duckdb_destroy_logical_type` + +* type: The logical type object +* returns: The key type of the map type. Must be destroyed with `duckdb_destroy_logical_type`. +*/ +DUCKDB_API duckdb_logical_type duckdb_map_type_key_type(duckdb_logical_type type); + +/*! +Retrieves the value type of the given map type. + +The result must be freed with `duckdb_destroy_logical_type` + +* type: The logical type object +* returns: The value type of the map type. Must be destroyed with `duckdb_destroy_logical_type`. +*/ +DUCKDB_API duckdb_logical_type duckdb_map_type_value_type(duckdb_logical_type type); + +/*! +Returns the number of children of a struct type. + +* type: The logical type object +* returns: The number of children of a struct type. +*/ +DUCKDB_API idx_t duckdb_struct_type_child_count(duckdb_logical_type type); + +/*! +Retrieves the name of the struct child. + +The result must be freed with `duckdb_free` + +* type: The logical type object +* index: The child index +* returns: The name of the struct type. Must be freed with `duckdb_free`. +*/ +DUCKDB_API char* duckdb_struct_type_child_name(duckdb_logical_type type, idx_t index); + +/*! +Retrieves the child type of the given struct type at the specified index. + +The result must be freed with `duckdb_destroy_logical_type` + +* type: The logical type object +* index: The child index +* returns: The child type of the struct type. Must be destroyed with `duckdb_destroy_logical_type`. +*/ +DUCKDB_API duckdb_logical_type duckdb_struct_type_child_type(duckdb_logical_type type, idx_t index); + +/*! +Returns the number of members that the union type has. + +* type: The logical type (union) object +* returns: The number of members of a union type. +*/ +DUCKDB_API idx_t duckdb_union_type_member_count(duckdb_logical_type type); + +/*! +Retrieves the name of the union member. + +The result must be freed with `duckdb_free` + +* type: The logical type object +* index: The child index +* returns: The name of the union member. Must be freed with `duckdb_free`. +*/ +DUCKDB_API char* duckdb_union_type_member_name(duckdb_logical_type type, idx_t index); + +/*! +Retrieves the child type of the given union member at the specified index. + +The result must be freed with `duckdb_destroy_logical_type` + +* type: The logical type object +* index: The child index +* returns: The child type of the union member. Must be destroyed with `duckdb_destroy_logical_type`. +*/ +DUCKDB_API duckdb_logical_type duckdb_union_type_member_type(duckdb_logical_type type, idx_t index); + +/*! +Destroys the logical type and de-allocates all memory allocated for that type. + +* type: The logical type to destroy. +*/ +DUCKDB_API void duckdb_destroy_logical_type(duckdb_logical_type* type); + +//===--------------------------------------------------------------------===// +// Data Chunk Interface +//===--------------------------------------------------------------------===// +/*! +Creates an empty DataChunk with the specified set of types. + +* types: An array of types of the data chunk. +* column_count: The number of columns. +* returns: The data chunk. +*/ +DUCKDB_API duckdb_data_chunk duckdb_create_data_chunk(duckdb_logical_type* types, idx_t column_count); + +/*! +Destroys the data chunk and de-allocates all memory allocated for that chunk. + +* chunk: The data chunk to destroy. +*/ +DUCKDB_API void duckdb_destroy_data_chunk(duckdb_data_chunk* chunk); + +/*! +Resets a data chunk, clearing the validity masks and setting the cardinality of the data chunk to 0. 
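+
+For instance (illustrative sketch of the chunk lifecycle):
+
+duckdb_logical_type types[1] = {duckdb_create_logical_type(DUCKDB_TYPE_BIGINT)};
+duckdb_data_chunk chunk = duckdb_create_data_chunk(types, 1);
+// ... fill the chunk, hand it off, then reuse it ...
+duckdb_data_chunk_reset(chunk);
+// ... fill it again ...
+duckdb_destroy_data_chunk(&chunk);
+duckdb_destroy_logical_type(&types[0]);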
+
+* chunk: The data chunk to reset.
+*/
+DUCKDB_API void duckdb_data_chunk_reset(duckdb_data_chunk chunk);
+
+/*!
+Retrieves the number of columns in a data chunk.
+
+* chunk: The data chunk to get the data from
+* returns: The number of columns in the data chunk
+*/
+DUCKDB_API idx_t duckdb_data_chunk_get_column_count(duckdb_data_chunk chunk);
+
+/*!
+Retrieves the vector at the specified column index in the data chunk.
+
+The pointer to the vector is valid for as long as the chunk is alive.
+It does NOT need to be destroyed.
+
+* chunk: The data chunk to get the data from
+* returns: The vector
+*/
+DUCKDB_API duckdb_vector duckdb_data_chunk_get_vector(duckdb_data_chunk chunk, idx_t col_idx);
+
+/*!
+Retrieves the current number of tuples in a data chunk.
+
+* chunk: The data chunk to get the data from
+* returns: The number of tuples in the data chunk
+*/
+DUCKDB_API idx_t duckdb_data_chunk_get_size(duckdb_data_chunk chunk);
+
+/*!
+Sets the current number of tuples in a data chunk.
+
+* chunk: The data chunk to set the size in
+* size: The number of tuples in the data chunk
+*/
+DUCKDB_API void duckdb_data_chunk_set_size(duckdb_data_chunk chunk, idx_t size);
+
+//===--------------------------------------------------------------------===//
+// Vector Interface
+//===--------------------------------------------------------------------===//
+/*!
+Retrieves the column type of the specified vector.
+
+The result must be destroyed with `duckdb_destroy_logical_type`.
+
+* vector: The vector to get the data from
+* returns: The type of the vector
+*/
+DUCKDB_API duckdb_logical_type duckdb_vector_get_column_type(duckdb_vector vector);
+
+/*!
+Retrieves the data pointer of the vector.
+
+The data pointer can be used to read or write values from the vector.
+How to read or write values depends on the type of the vector.
+
+* vector: The vector to get the data from
+* returns: The data pointer
+*/
+DUCKDB_API void* duckdb_vector_get_data(duckdb_vector vector);
+
+/*!
+Retrieves the validity mask pointer of the specified vector.
+
+If all values are valid, this function MIGHT return NULL!
+
+The validity mask is a bitset that signifies null-ness within the data chunk.
+It is a series of uint64_t values, where each uint64_t value contains validity for 64 tuples.
+The bit is set to 1 if the value is valid (i.e. not NULL) or 0 if the value is invalid (i.e. NULL).
+
+Validity of a specific value can be obtained like this:
+
+idx_t entry_idx = row_idx / 64;
+idx_t idx_in_entry = row_idx % 64;
+bool is_valid = validity_mask[entry_idx] & (1ULL << idx_in_entry);
+
+Alternatively, the (slower) duckdb_validity_row_is_valid function can be used.
+
+* vector: The vector to get the data from
+* returns: The pointer to the validity mask, or NULL if no validity mask is present
+*/
+DUCKDB_API uint64_t* duckdb_vector_get_validity(duckdb_vector vector);
+
+/*!
+Ensures the validity mask is writable by allocating it.
+
+After this function is called, `duckdb_vector_get_validity` will ALWAYS return non-NULL.
+This allows null values to be written to the vector, regardless of whether a validity mask was present before.
+
+* vector: The vector to alter
+*/
+DUCKDB_API void duckdb_vector_ensure_validity_writable(duckdb_vector vector);
+
+/*!
+Assigns a string element in the vector at the specified location.
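+
+For instance (illustrative sketch; `chunk` is assumed to hold a single VARCHAR column):
+
+duckdb_vector vec = duckdb_data_chunk_get_vector(chunk, 0);
+duckdb_vector_assign_string_element(vec, 0, "hello");
+duckdb_data_chunk_set_size(chunk, 1);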
+
+* vector: The vector to alter
+* index: The row position in the vector to assign the string to
+* str: The null-terminated string
+*/
+DUCKDB_API void duckdb_vector_assign_string_element(duckdb_vector vector, idx_t index, const char* str);
+
+/*!
+Assigns a string element in the vector at the specified location.
+
+* vector: The vector to alter
+* index: The row position in the vector to assign the string to
+* str: The string
+* str_len: The length of the string (in bytes)
+*/
+DUCKDB_API void
+duckdb_vector_assign_string_element_len(duckdb_vector vector, idx_t index, const char* str, idx_t str_len);
+
+/*!
+Retrieves the child vector of a list vector.
+
+The resulting vector is valid as long as the parent vector is valid.
+
+* vector: The vector
+* returns: The child vector
+*/
+DUCKDB_API duckdb_vector duckdb_list_vector_get_child(duckdb_vector vector);
+
+/*!
+Returns the size of the child vector of the list.
+
+* vector: The vector
+* returns: The size of the child list
+*/
+DUCKDB_API idx_t duckdb_list_vector_get_size(duckdb_vector vector);
+
+/*!
+Retrieves the child vector of a struct vector.
+
+The resulting vector is valid as long as the parent vector is valid.
+
+* vector: The vector
+* index: The child index
+* returns: The child vector
+*/
+DUCKDB_API duckdb_vector duckdb_struct_vector_get_child(duckdb_vector vector, idx_t index);
+
+//===--------------------------------------------------------------------===//
+// Validity Mask Functions
+//===--------------------------------------------------------------------===//
+/*!
+Returns whether or not a row is valid (i.e. not NULL) in the given validity mask.
+
+* validity: The validity mask, as obtained through `duckdb_vector_get_validity`
+* row: The row index
+* returns: true if the row is valid, false otherwise
+*/
+DUCKDB_API bool duckdb_validity_row_is_valid(uint64_t* validity, idx_t row);
+
+/*!
+In a validity mask, sets a specific row to either valid or invalid.
+
+Note that `duckdb_vector_ensure_validity_writable` should be called before calling `duckdb_vector_get_validity`,
+to ensure that there is a validity mask to write to.
+
+* validity: The validity mask, as obtained through `duckdb_vector_get_validity`.
+* row: The row index
+* valid: Whether to set the row to valid (true) or invalid (false)
+*/
+DUCKDB_API void duckdb_validity_set_row_validity(uint64_t* validity, idx_t row, bool valid);
+
+/*!
+In a validity mask, sets a specific row to invalid.
+
+Equivalent to `duckdb_validity_set_row_validity` with valid set to false.
+
+* validity: The validity mask
+* row: The row index
+*/
+DUCKDB_API void duckdb_validity_set_row_invalid(uint64_t* validity, idx_t row);
+
+/*!
+In a validity mask, sets a specific row to valid.
+
+Equivalent to `duckdb_validity_set_row_validity` with valid set to true.
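+
+The write path as a whole looks like this (illustrative sketch; marks row 5 of
+`vec` as NULL):
+
+duckdb_vector_ensure_validity_writable(vec);
+uint64_t* validity = duckdb_vector_get_validity(vec);
+duckdb_validity_set_row_invalid(validity, 5);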
+ +* validity: The validity mask +* row: The row index +*/ +DUCKDB_API void duckdb_validity_set_row_valid(uint64_t* validity, idx_t row); + +//===--------------------------------------------------------------------===// +// Table Functions +//===--------------------------------------------------------------------===// +typedef void* duckdb_table_function; +typedef void* duckdb_bind_info; +typedef void* duckdb_init_info; +typedef void* duckdb_function_info; + +typedef void (*duckdb_table_function_bind_t)(duckdb_bind_info info); +typedef void (*duckdb_table_function_init_t)(duckdb_init_info info); +typedef void (*duckdb_table_function_t)(duckdb_function_info info, duckdb_data_chunk output); +typedef void (*duckdb_delete_callback_t)(void* data); + +/*! +Creates a new empty table function. + +The return value should be destroyed with `duckdb_destroy_table_function`. + +* returns: The table function object. +*/ +DUCKDB_API duckdb_table_function duckdb_create_table_function(); + +/*! +Destroys the given table function object. + +* table_function: The table function to destroy +*/ +DUCKDB_API void duckdb_destroy_table_function(duckdb_table_function* table_function); + +/*! +Sets the name of the given table function. + +* table_function: The table function +* name: The name of the table function +*/ +DUCKDB_API void duckdb_table_function_set_name(duckdb_table_function table_function, const char* name); + +/*! +Adds a parameter to the table function. + +* table_function: The table function +* type: The type of the parameter to add. +*/ +DUCKDB_API void duckdb_table_function_add_parameter(duckdb_table_function table_function, duckdb_logical_type type); + +/*! +Assigns extra information to the table function that can be fetched during binding, etc. + +* table_function: The table function +* extra_info: The extra information +* destroy: The callback that will be called to destroy the bind data (if any) +*/ +DUCKDB_API void duckdb_table_function_set_extra_info(duckdb_table_function table_function, + void* extra_info, + duckdb_delete_callback_t destroy); + +/*! +Sets the bind function of the table function + +* table_function: The table function +* bind: The bind function +*/ +DUCKDB_API void duckdb_table_function_set_bind(duckdb_table_function table_function, duckdb_table_function_bind_t bind); + +/*! +Sets the init function of the table function + +* table_function: The table function +* init: The init function +*/ +DUCKDB_API void duckdb_table_function_set_init(duckdb_table_function table_function, duckdb_table_function_init_t init); + +/*! +Sets the thread-local init function of the table function + +* table_function: The table function +* init: The init function +*/ +DUCKDB_API void duckdb_table_function_set_local_init(duckdb_table_function table_function, + duckdb_table_function_init_t init); + +/*! +Sets the main function of the table function + +* table_function: The table function +* function: The function +*/ +DUCKDB_API void duckdb_table_function_set_function(duckdb_table_function table_function, + duckdb_table_function_t function); + +/*! +Sets whether or not the given table function supports projection pushdown. + +If this is set to true, the system will provide a list of all required columns in the `init` stage through +the `duckdb_init_get_column_count` and `duckdb_init_get_column_index` functions. +If this is set to false (the default), the system will expect all columns to be projected. 
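+
+With pushdown enabled, the init function can then do something like this
+(illustrative sketch):
+
+idx_t n = duckdb_init_get_column_count(info);
+for (idx_t i = 0; i < n; i++) {
+    idx_t col_idx = duckdb_init_get_column_index(info, i);
+    // only the columns listed here need to be produced
+}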
+
+* table_function: The table function
+* pushdown: True if the table function supports projection pushdown, false otherwise.
+*/
+DUCKDB_API void duckdb_table_function_supports_projection_pushdown(duckdb_table_function table_function, bool pushdown);
+
+/*!
+Register the table function object within the given connection.
+
+The function requires at least a name, a bind function, an init function and a main function.
+
+If the function is incomplete or a function with this name already exists, DuckDBError is returned.
+
+* con: The connection to register it in.
+* function: The function pointer
+* returns: Whether or not the registration was successful.
+*/
+DUCKDB_API duckdb_state duckdb_register_table_function(duckdb_connection con, duckdb_table_function function);
+
+//===--------------------------------------------------------------------===//
+// Table Function Bind
+//===--------------------------------------------------------------------===//
+/*!
+Retrieves the extra info of the function as set in `duckdb_table_function_set_extra_info`.
+
+* info: The info object
+* returns: The extra info
+*/
+DUCKDB_API void* duckdb_bind_get_extra_info(duckdb_bind_info info);
+
+/*!
+Adds a result column to the output of the table function.
+
+* info: The info object
+* name: The name of the column
+* type: The logical type of the column
+*/
+DUCKDB_API void duckdb_bind_add_result_column(duckdb_bind_info info, const char* name, duckdb_logical_type type);
+
+/*!
+Retrieves the number of regular (non-named) parameters to the function.
+
+* info: The info object
+* returns: The number of parameters
+*/
+DUCKDB_API idx_t duckdb_bind_get_parameter_count(duckdb_bind_info info);
+
+/*!
+Retrieves the parameter at the given index.
+
+The result must be destroyed with `duckdb_destroy_value`.
+
+* info: The info object
+* index: The index of the parameter to get
+* returns: The value of the parameter. Must be destroyed with `duckdb_destroy_value`.
+*/
+DUCKDB_API duckdb_value duckdb_bind_get_parameter(duckdb_bind_info info, idx_t index);
+
+/*!
+Sets the user-provided bind data in the bind object. This object can be retrieved again during execution.
+
+* info: The info object
+* bind_data: The bind data object.
+* destroy: The callback that will be called to destroy the bind data (if any)
+*/
+DUCKDB_API void duckdb_bind_set_bind_data(duckdb_bind_info info, void* bind_data, duckdb_delete_callback_t destroy);
+
+/*!
+Sets the cardinality estimate for the table function, used for optimization.
+
+* info: The bind data object.
+* cardinality: The estimated number of rows produced by the function.
+* is_exact: Whether or not the cardinality estimate is exact, or an approximation
+*/
+DUCKDB_API void duckdb_bind_set_cardinality(duckdb_bind_info info, idx_t cardinality, bool is_exact);
+
+/*!
+Report that an error has occurred while calling bind.
+
+* info: The info object
+* error: The error message
+*/
+DUCKDB_API void duckdb_bind_set_error(duckdb_bind_info info, const char* error);
+
+//===--------------------------------------------------------------------===//
+// Table Function Init
+//===--------------------------------------------------------------------===//
+
+/*!
+Retrieves the extra info of the function as set in `duckdb_table_function_set_extra_info`.
+
+* info: The info object
+* returns: The extra info
+*/
+DUCKDB_API void* duckdb_init_get_extra_info(duckdb_init_info info);
+
+/*!
+Gets the bind data set by `duckdb_bind_set_bind_data` during the bind.
+
+Note that the bind data should be considered as read-only.
+For tracking state, use the init data instead.
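+
+For instance (illustrative sketch; `my_bind_data_t` stands in for whatever
+struct was passed to `duckdb_bind_set_bind_data`):
+
+my_bind_data_t* bind_data = (my_bind_data_t*) duckdb_init_get_bind_data(info);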
+
+* info: The info object
+* returns: The bind data object
+*/
+DUCKDB_API void* duckdb_init_get_bind_data(duckdb_init_info info);
+
+/*!
+Sets the user-provided init data in the init object. This object can be retrieved again during execution.
+
+* info: The info object
+* init_data: The init data object.
+* destroy: The callback that will be called to destroy the init data (if any)
+*/
+DUCKDB_API void duckdb_init_set_init_data(duckdb_init_info info, void* init_data, duckdb_delete_callback_t destroy);
+
+/*!
+Returns the number of projected columns.
+
+This function must be used if projection pushdown is enabled to figure out which columns to emit.
+
+* info: The info object
+* returns: The number of projected columns.
+*/
+DUCKDB_API idx_t duckdb_init_get_column_count(duckdb_init_info info);
+
+/*!
+Returns the column index of the projected column at the specified position.
+
+This function must be used if projection pushdown is enabled to figure out which columns to emit.
+
+* info: The info object
+* column_index: The index at which to get the projected column index, from 0..duckdb_init_get_column_count(info)
+* returns: The column index of the projected column.
+*/
+DUCKDB_API idx_t duckdb_init_get_column_index(duckdb_init_info info, idx_t column_index);
+
+/*!
+Sets how many threads can process this table function in parallel (default: 1).
+
+* info: The info object
+* max_threads: The maximum amount of threads that can process this table function
+*/
+DUCKDB_API void duckdb_init_set_max_threads(duckdb_init_info info, idx_t max_threads);
+
+/*!
+Report that an error has occurred while calling init.
+
+* info: The info object
+* error: The error message
+*/
+DUCKDB_API void duckdb_init_set_error(duckdb_init_info info, const char* error);
+
+//===--------------------------------------------------------------------===//
+// Table Function
+//===--------------------------------------------------------------------===//
+
+/*!
+Retrieves the extra info of the function as set in `duckdb_table_function_set_extra_info`.
+
+* info: The info object
+* returns: The extra info
+*/
+DUCKDB_API void* duckdb_function_get_extra_info(duckdb_function_info info);
+
+/*!
+Gets the bind data set by `duckdb_bind_set_bind_data` during the bind.
+
+Note that the bind data should be considered as read-only.
+For tracking state, use the init data instead.
+
+* info: The info object
+* returns: The bind data object
+*/
+DUCKDB_API void* duckdb_function_get_bind_data(duckdb_function_info info);
+
+/*!
+Gets the init data set by `duckdb_init_set_init_data` during the init.
+
+* info: The info object
+* returns: The init data object
+*/
+DUCKDB_API void* duckdb_function_get_init_data(duckdb_function_info info);
+
+/*!
+Gets the thread-local init data set by `duckdb_init_set_init_data` during the local_init.
+
+* info: The info object
+* returns: The init data object
+*/
+DUCKDB_API void* duckdb_function_get_local_init_data(duckdb_function_info info);
+
+/*!
+Report that an error has occurred while executing the function.
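+
+For instance (illustrative sketch; `my_state_t` stands in for whatever struct
+was passed to `duckdb_init_set_init_data`):
+
+my_state_t* state = (my_state_t*) duckdb_function_get_init_data(info);
+if (state->failed) {
+    duckdb_function_set_error(info, "my_table_function: failed to read input");
+    return;
+}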
+
+* info: The info object
+* error: The error message
+*/
+DUCKDB_API void duckdb_function_set_error(duckdb_function_info info, const char* error);
+
+//===--------------------------------------------------------------------===//
+// Replacement Scans
+//===--------------------------------------------------------------------===//
+typedef void* duckdb_replacement_scan_info;
+
+typedef void (*duckdb_replacement_callback_t)(duckdb_replacement_scan_info info, const char* table_name, void* data);
+
+/*!
+Add a replacement scan definition to the specified database.
+
+* db: The database object to add the replacement scan to
+* replacement: The replacement scan callback
+* extra_data: Extra data that is passed back into the specified callback
+* delete_callback: The delete callback to call on the extra data, if any
+*/
+DUCKDB_API void duckdb_add_replacement_scan(duckdb_database db,
+                                            duckdb_replacement_callback_t replacement,
+                                            void* extra_data,
+                                            duckdb_delete_callback_t delete_callback);
+
+/*!
+Sets the replacement function name to use. If this function is called in the replacement callback,
+the replacement scan is performed. If it is not called, the replacement scan is not performed.
+
+* info: The info object
+* function_name: The function name to substitute.
+*/
+DUCKDB_API void duckdb_replacement_scan_set_function_name(duckdb_replacement_scan_info info, const char* function_name);
+
+/*!
+Adds a parameter to the replacement scan function.
+
+* info: The info object
+* parameter: The parameter to add.
+*/
+DUCKDB_API void duckdb_replacement_scan_add_parameter(duckdb_replacement_scan_info info, duckdb_value parameter);
+
+/*!
+Report that an error has occurred while executing the replacement scan.
+
+* info: The info object
+* error: The error message
+*/
+DUCKDB_API void duckdb_replacement_scan_set_error(duckdb_replacement_scan_info info, const char* error);
+
+//===--------------------------------------------------------------------===//
+// Appender
+//===--------------------------------------------------------------------===//
+
+// Appenders are the most efficient way of loading data into DuckDB from within the C interface, and are recommended for
+// fast data loading. The appender is much faster than using prepared statements or individual `INSERT INTO` statements.
+
+// Appends are made in row-wise format. For every column, a `duckdb_append_[type]` call should be made, after which
+// the row should be finished by calling `duckdb_appender_end_row`. After all rows have been appended,
+// `duckdb_appender_destroy` should be used to finalize the appender and clean up the resulting memory.
+
+// Note that `duckdb_appender_destroy` should always be called on the resulting appender, even if the function returns
+// `DuckDBError`.
+
+/*!
+Creates an appender object.
+
+* connection: The connection context to create the appender in.
+* schema: The schema of the table to append to, or `nullptr` for the default schema.
+* table: The table name to append to.
+* out_appender: The resulting appender object.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_appender_create(duckdb_connection connection,
+                                               const char* schema,
+                                               const char* table,
+                                               duckdb_appender* out_appender);
+
+/*!
+Returns the error message associated with the given appender.
+If the appender has no error message, this returns `nullptr` instead.
+
+The error message should not be freed. It will be de-allocated when `duckdb_appender_destroy` is called.
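+
+For instance (illustrative sketch; appends to an existing table `tbl` over an
+open connection `con`):
+
+duckdb_appender appender;
+if (duckdb_appender_create(con, NULL, "tbl", &appender) == DuckDBError) {
+    printf("%s\n", duckdb_appender_error(appender));
+}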
+
+* appender: The appender to get the error from.
+* returns: The error message, or `nullptr` if there is none.
+*/
+DUCKDB_API const char* duckdb_appender_error(duckdb_appender appender);
+
+/*!
+Flush the appender to the table, forcing the cache of the appender to be cleared and the data to be appended to the
+base table.
+
+This should generally not be used unless you know what you are doing. Instead, call `duckdb_appender_destroy` when you
+are done with the appender.
+
+* appender: The appender to flush.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_appender_flush(duckdb_appender appender);
+
+/*!
+Close the appender, flushing all intermediate state in the appender to the table and closing it for further appends.
+
+This is generally not necessary. Call `duckdb_appender_destroy` instead.
+
+* appender: The appender to flush and close.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_appender_close(duckdb_appender appender);
+
+/*!
+Close the appender and destroy it. This flushes all intermediate state in the appender to the table and de-allocates
+all memory associated with the appender.
+
+* appender: The appender to flush, close and destroy.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_appender_destroy(duckdb_appender* appender);
+
+/*!
+A no-op function, provided for backwards compatibility reasons. Does nothing. Only `duckdb_appender_end_row` is required.
+*/
+DUCKDB_API duckdb_state duckdb_appender_begin_row(duckdb_appender appender);
+
+/*!
+Finish the current row of appends. After end_row is called, the next row can be appended.
+
+* appender: The appender.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_appender_end_row(duckdb_appender appender);
+
+/*!
+Append a bool value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_bool(duckdb_appender appender, bool value);
+
+/*!
+Append an int8_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_int8(duckdb_appender appender, int8_t value);
+/*!
+Append an int16_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_int16(duckdb_appender appender, int16_t value);
+/*!
+Append an int32_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_int32(duckdb_appender appender, int32_t value);
+/*!
+Append an int64_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_int64(duckdb_appender appender, int64_t value);
+/*!
+Append a duckdb_hugeint value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_hugeint(duckdb_appender appender, duckdb_hugeint value);
+
+/*!
+Append a uint8_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_uint8(duckdb_appender appender, uint8_t value);
+/*!
+Append a uint16_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_uint16(duckdb_appender appender, uint16_t value);
+/*!
+Append a uint32_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_uint32(duckdb_appender appender, uint32_t value);
+/*!
+Append a uint64_t value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_uint64(duckdb_appender appender, uint64_t value);
+
+/*!
+Append a float value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_float(duckdb_appender appender, float value);
+/*!
+Append a double value to the appender.
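+
+A full row append looks like this (illustrative sketch; the table is assumed to
+have an INTEGER and a DOUBLE column):
+
+duckdb_append_int32(appender, 42);
+duckdb_append_double(appender, 3.14);
+duckdb_appender_end_row(appender);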
+*/
+DUCKDB_API duckdb_state duckdb_append_double(duckdb_appender appender, double value);
+
+/*!
+Append a duckdb_date value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_date(duckdb_appender appender, duckdb_date value);
+/*!
+Append a duckdb_time value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_time(duckdb_appender appender, duckdb_time value);
+/*!
+Append a duckdb_timestamp value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_timestamp(duckdb_appender appender, duckdb_timestamp value);
+/*!
+Append a duckdb_interval value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_interval(duckdb_appender appender, duckdb_interval value);
+
+/*!
+Append a null-terminated varchar value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_varchar(duckdb_appender appender, const char* val);
+/*!
+Append a varchar value with an explicit length to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_varchar_length(duckdb_appender appender, const char* val, idx_t length);
+/*!
+Append a blob value to the appender.
+*/
+DUCKDB_API duckdb_state duckdb_append_blob(duckdb_appender appender, const void* data, idx_t length);
+/*!
+Append a NULL value to the appender (of any type).
+*/
+DUCKDB_API duckdb_state duckdb_append_null(duckdb_appender appender);
+
+/*!
+Appends a pre-filled data chunk to the specified appender.
+
+The types of the data chunk must exactly match the types of the table, no casting is performed.
+If the types do not match or the appender is in an invalid state, DuckDBError is returned.
+If the append is successful, DuckDBSuccess is returned.
+
+* appender: The appender to append to.
+* chunk: The data chunk to append.
+* returns: The return state.
+*/
+DUCKDB_API duckdb_state duckdb_append_data_chunk(duckdb_appender appender, duckdb_data_chunk chunk);
+
+//===--------------------------------------------------------------------===//
+// Arrow Interface
+//===--------------------------------------------------------------------===//
+/*!
+Executes a SQL query within a connection and stores the full (materialized) result in an arrow structure.
+If the query fails to execute, DuckDBError is returned and the error message can be retrieved by calling
+`duckdb_query_arrow_error`.
+
+Note that after running `duckdb_query_arrow`, `duckdb_destroy_arrow` must be called on the result object even if the
+query fails, otherwise the error stored within the result will not be freed correctly.
+
+* connection: The connection to perform the query in.
+* query: The SQL query to run.
+* out_result: The query result.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_query_arrow(duckdb_connection connection, const char* query, duckdb_arrow* out_result);
+
+/*!
+Fetch the internal arrow schema from the arrow result.
+
+* result: The result to fetch the schema from.
+* out_schema: The output schema.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
+*/
+DUCKDB_API duckdb_state duckdb_query_arrow_schema(duckdb_arrow result, duckdb_arrow_schema* out_schema);
+
+/*!
+Fetch an internal arrow array from the arrow result.
+
+This function can be called multiple times to fetch subsequent chunks; each call frees the previous out_array,
+so consume the out_array before calling this function again.
+
+* result: The result to fetch the array from.
+* out_array: The output array.
+* returns: `DuckDBSuccess` on success or `DuckDBError` on failure.
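+
+A typical fetch loop (illustrative sketch; the stopping condition is up to the
+consumer of the arrays):
+
+duckdb_arrow_array arrow_array;
+while (duckdb_query_arrow_array(result, &arrow_array) == DuckDBSuccess) {
+    // consume arrow_array here, before the next call frees it;
+    // break once an empty array is returned
+}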
+*/
+DUCKDB_API duckdb_state duckdb_query_arrow_array(duckdb_arrow result, duckdb_arrow_array* out_array);
+
+/*!
+Returns the number of columns present in the arrow result object.
+
+* result: The result object.
+* returns: The number of columns present in the result object.
+*/
+DUCKDB_API idx_t duckdb_arrow_column_count(duckdb_arrow result);
+
+/*!
+Returns the number of rows present in the arrow result object.
+
+* result: The result object.
+* returns: The number of rows present in the result object.
+*/
+DUCKDB_API idx_t duckdb_arrow_row_count(duckdb_arrow result);
+
+/*!
+Returns the number of rows changed by the query stored in the arrow result. This is relevant only for
+INSERT/UPDATE/DELETE queries. For other queries the rows_changed will be 0.
+
+* result: The result object.
+* returns: The number of rows changed.
+*/
+DUCKDB_API idx_t duckdb_arrow_rows_changed(duckdb_arrow result);
+
+/*!
+Returns the error message contained within the result. The error is only set if `duckdb_query_arrow` returns
+`DuckDBError`.
+
+The error message should not be freed. It will be de-allocated when `duckdb_destroy_arrow` is called.
+
+* result: The result object to fetch the error from.
+* returns: The error of the result.
+*/
+DUCKDB_API const char* duckdb_query_arrow_error(duckdb_arrow result);
+
+/*!
+Closes the result and de-allocates all memory allocated for the arrow result.
+
+* result: The result to destroy.
+*/
+DUCKDB_API void duckdb_destroy_arrow(duckdb_arrow* result);
+
+//===--------------------------------------------------------------------===//
+// Threading Information
+//===--------------------------------------------------------------------===//
+typedef void* duckdb_task_state;
+
+/*!
+Execute DuckDB tasks on this thread.
+
+Will return after `max_tasks` have been executed, or if there are no more tasks present.
+
+* database: The database object to execute tasks for
+* max_tasks: The maximum amount of tasks to execute
+*/
+DUCKDB_API void duckdb_execute_tasks(duckdb_database database, idx_t max_tasks);
+
+/*!
+Creates a task state that can be used with duckdb_execute_tasks_state to execute tasks until
+duckdb_finish_execution is called on the state.
+
+duckdb_destroy_task_state should be called on the result in order to free memory.
+
+* database: The database object to create the task state for
+* returns: The task state that can be used with duckdb_execute_tasks_state.
+*/
+DUCKDB_API duckdb_task_state duckdb_create_task_state(duckdb_database database);
+
+/*!
+Execute DuckDB tasks on this thread.
+
+The thread will keep on executing tasks forever, until duckdb_finish_execution is called on the state.
+Multiple threads can share the same duckdb_task_state.
+
+* state: The task state of the executor
+*/
+DUCKDB_API void duckdb_execute_tasks_state(duckdb_task_state state);
+
+/*!
+Execute DuckDB tasks on this thread.
+
+The thread will keep on executing tasks until either duckdb_finish_execution is called on the state,
+max_tasks tasks have been executed or there are no more tasks to be executed.
+
+Multiple threads can share the same duckdb_task_state.
+
+* state: The task state of the executor
+* max_tasks: The maximum amount of tasks to execute
+* returns: The amount of tasks that have actually been executed
+*/
+DUCKDB_API idx_t duckdb_execute_n_tasks_state(duckdb_task_state state, idx_t max_tasks);
+
+/*!
+Finish execution on a specific task state.
+
+* state: The task state to finish execution on
+*/
+DUCKDB_API void duckdb_finish_execution(duckdb_task_state state);
+
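+/*
+A sketch of the intended usage (illustrative; `db` is an open database):
+
+duckdb_task_state state = duckdb_create_task_state(db);
+// worker thread(s): execute tasks until finished
+duckdb_execute_tasks_state(state);
+// main thread, once the work is done:
+duckdb_finish_execution(state);
+duckdb_destroy_task_state(state);
+*/
+
+/*!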
+Check if the provided duckdb_task_state has finished execution.
+
+* state: The task state to inspect
+* returns: Whether or not duckdb_finish_execution has been called on the task state
+*/
+DUCKDB_API bool duckdb_task_state_is_finished(duckdb_task_state state);
+
+/*!
+Destroys the task state returned from duckdb_create_task_state.
+
+Note that this should not be called while there is an active duckdb_execute_tasks_state running
+on the task state.
+
+* state: The task state to clean up
+*/
+DUCKDB_API void duckdb_destroy_task_state(duckdb_task_state state);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/benchmarks/include/duckdb/exception.hpp b/benchmarks/include/duckdb/exception.hpp
new file mode 100644
index 0000000..a0dfba2
--- /dev/null
+++ b/benchmarks/include/duckdb/exception.hpp
@@ -0,0 +1,354 @@
+#pragma once
+
+#include "assert.hpp"
+#include "common.hpp"
+#include "exception_format_value.hpp"
+#include "vector.hpp"
+#include <stdexcept>
+
+namespace alp_bench {
+enum class PhysicalType : uint8_t;
+struct LogicalType;
+struct hugeint_t;
+
+inline void assert_restrict_function(
+    void* left_start, void* left_end, void* right_start, void* right_end, const char* fname, int linenr) {
+    // assert that the two pointers do not overlap
+#ifdef DEBUG
+    if (!(left_end <= right_start || right_end <= left_start)) {
+        printf("ASSERT RESTRICT FAILED: %s:%d\n", fname, linenr);
+        D_ASSERT(0);
+    }
+#endif
+}
+
+#define ASSERT_RESTRICT(left_start, left_end, right_start, right_end) \
+    assert_restrict_function(left_start, left_end, right_start, right_end, __FILE__, __LINE__)
+
+//===--------------------------------------------------------------------===//
+// Exception Types
+//===--------------------------------------------------------------------===//
+
+enum class ExceptionType {
+    INVALID = 0,                 // invalid type
+    OUT_OF_RANGE = 1,            // value out of range error
+    CONVERSION = 2,              // conversion/casting error
+    UNKNOWN_TYPE = 3,            // unknown type
+    DECIMAL = 4,                 // decimal related
+    MISMATCH_TYPE = 5,           // type mismatch
+    DIVIDE_BY_ZERO = 6,          // divide by 0
+    OBJECT_SIZE = 7,             // object size exceeded
+    INVALID_TYPE = 8,            // incompatible for operation
+    SERIALIZATION = 9,           // serialization
+    TRANSACTION = 10,            // transaction management
+    NOT_IMPLEMENTED = 11,        // method not implemented
+    EXPRESSION = 12,             // expression parsing
+    CATALOG = 13,                // catalog related
+    PARSER = 14,                 // parser related
+    PLANNER = 15,                // planner related
+    SCHEDULER = 16,              // scheduler related
+    EXECUTOR = 17,               // executor related
+    CONSTRAINT = 18,             // constraint related
+    INDEX = 19,                  // index related
+    STAT = 20,                   // stat related
+    CONNECTION = 21,             // connection related
+    SYNTAX = 22,                 // syntax related
+    SETTINGS = 23,               // settings related
+    BINDER = 24,                 // binder related
+    NETWORK = 25,                // network related
+    OPTIMIZER = 26,              // optimizer related
+    NULL_POINTER = 27,           // nullptr exception
+    IO = 28,                     // IO exception
+    INTERRUPT = 29,              // interrupt
+    FATAL = 30,    // Fatal exceptions are non-recoverable, and render the entire DB in an unusable state
+    INTERNAL = 31, // Internal exceptions indicate something went wrong internally (i.e. bug in the code base)
+    INVALID_INPUT = 32,          // Input or arguments error
+    OUT_OF_MEMORY = 33,          // out of memory
+    PERMISSION = 34,             // insufficient permissions
+    PARAMETER_NOT_RESOLVED = 35, // parameter types could not be resolved
+    PARAMETER_NOT_ALLOWED = 36,  // parameter types not allowed
+    DEPENDENCY = 37              // dependency
+};
+
+class Exception : public std::exception {
+public:
+    DUCKDB_API explicit Exception(const string& msg);
+    DUCKDB_API Exception(ExceptionType exception_type, const string& message);
+
+    ExceptionType type;
+
+public:
+    DUCKDB_API const char* what() const noexcept override;
+    DUCKDB_API const string& RawMessage() const;
+
+    DUCKDB_API static string ExceptionTypeToString(ExceptionType type);
+    [[noreturn]] DUCKDB_API static void ThrowAsTypeWithMessage(ExceptionType type, const string& message);
+
+    template <typename... Args>
+    static string ConstructMessage(const string& msg, Args... params) {
+        vector<ExceptionFormatValue> values;
+        return ConstructMessageRecursive(msg, values, params...);
+    }
+
+    DUCKDB_API static string ConstructMessageRecursive(const string& msg, vector<ExceptionFormatValue>& values);
+
+    template <class T, typename... Args>
+    static string
+    ConstructMessageRecursive(const string& msg, vector<ExceptionFormatValue>& values, T param, Args... params) {
+        values.push_back(ExceptionFormatValue::CreateFormatValue(param));
+        return ConstructMessageRecursive(msg, values, params...);
+    }
+
+    DUCKDB_API static bool UncaughtException();
+
+    DUCKDB_API static string GetStackTrace(int max_depth = 120);
+
+private:
+    string exception_message_;
+    string raw_message_;
+};
+
+//===--------------------------------------------------------------------===//
+// Exception derived classes
+//===--------------------------------------------------------------------===//
+
+//! Exceptions that are StandardExceptions do NOT invalidate the current transaction when thrown
+class StandardException : public Exception {
+public:
+    DUCKDB_API StandardException(ExceptionType exception_type, const string& message);
+};
+
+class CatalogException : public StandardException {
+public:
+    DUCKDB_API explicit CatalogException(const string& msg);
+
+    template <typename... Args>
+    explicit CatalogException(const string& msg, Args... params)
+        : CatalogException(ConstructMessage(msg, params...)) {}
+};
+
+class ConnectionException : public StandardException {
+public:
+    DUCKDB_API explicit ConnectionException(const string& msg);
+
+    template <typename... Args>
+    explicit ConnectionException(const string& msg, Args... params)
+        : ConnectionException(ConstructMessage(msg, params...)) {}
+};
+
+class ParserException : public StandardException {
+public:
+    DUCKDB_API explicit ParserException(const string& msg);
+
+    template <typename... Args>
+    explicit ParserException(const string& msg, Args... params)
+        : ParserException(ConstructMessage(msg, params...)) {}
+};
+
+class PermissionException : public StandardException {
+public:
+    DUCKDB_API explicit PermissionException(const string& msg);
+
+    template <typename... Args>
+    explicit PermissionException(const string& msg, Args... params)
+        : PermissionException(ConstructMessage(msg, params...)) {}
+};
+
+class BinderException : public StandardException {
+public:
+    DUCKDB_API explicit BinderException(const string& msg);
+
+    template <typename... Args>
+    explicit BinderException(const string& msg, Args... params)
+        : BinderException(ConstructMessage(msg, params...)) {}
+};
+
+class ConversionException : public Exception {
+public:
+    DUCKDB_API explicit ConversionException(const string& msg);
+
+    template <typename... Args>
+    explicit ConversionException(const string& msg, Args... params)
+        : ConversionException(ConstructMessage(msg, params...)) {}
+};
+
+class TransactionException : public Exception {
+public:
+    DUCKDB_API explicit TransactionException(const string& msg);
+
+    template <typename... Args>
+    explicit TransactionException(const string& msg, Args... params)
+        : TransactionException(ConstructMessage(msg, params...)) {}
+};
+
+class NotImplementedException : public Exception {
+public:
+    DUCKDB_API explicit NotImplementedException(const string& msg);
+
+    template <typename... Args>
+    explicit NotImplementedException(const string& msg, Args... params)
+        : NotImplementedException(ConstructMessage(msg, params...)) {}
+};
+
+class OutOfRangeException : public Exception {
+public:
+    DUCKDB_API explicit OutOfRangeException(const string& msg);
+
+    template <typename... Args>
+    explicit OutOfRangeException(const string& msg, Args... params)
+        : OutOfRangeException(ConstructMessage(msg, params...)) {}
+};
+
+class OutOfMemoryException : public Exception {
+public:
+    DUCKDB_API explicit OutOfMemoryException(const string& msg);
+
+    template <typename... Args>
+    explicit OutOfMemoryException(const string& msg, Args... params)
+        : OutOfMemoryException(ConstructMessage(msg, params...)) {}
+};
+
+class SyntaxException : public Exception {
+public:
+    DUCKDB_API explicit SyntaxException(const string& msg);
+
+    template <typename... Args>
+    explicit SyntaxException(const string& msg, Args... params)
+        : SyntaxException(ConstructMessage(msg, params...)) {}
+};
+
+class ConstraintException : public Exception {
+public:
+    DUCKDB_API explicit ConstraintException(const string& msg);
+
+    template <typename... Args>
+    explicit ConstraintException(const string& msg, Args... params)
+        : ConstraintException(ConstructMessage(msg, params...)) {}
+};
+
+class DependencyException : public Exception {
+public:
+    DUCKDB_API explicit DependencyException(const string& msg);
+
+    template <typename... Args>
+    explicit DependencyException(const string& msg, Args... params)
+        : DependencyException(ConstructMessage(msg, params...)) {}
+};
+
+class IOException : public Exception {
+public:
+    DUCKDB_API explicit IOException(const string& msg);
+
+    template <typename... Args>
+    explicit IOException(const string& msg, Args... params)
+        : IOException(ConstructMessage(msg, params...)) {}
+};
+
+class SerializationException : public Exception {
+public:
+    DUCKDB_API explicit SerializationException(const string& msg);
+
+    template <typename... Args>
+    explicit SerializationException(const string& msg, Args... params)
+        : SerializationException(ConstructMessage(msg, params...)) {}
+};
+
+class SequenceException : public Exception {
+public:
+    DUCKDB_API explicit SequenceException(const string& msg);
+
+    template <typename... Args>
+    explicit SequenceException(const string& msg, Args... params)
+        : SequenceException(ConstructMessage(msg, params...)) {}
+};
+
+class InterruptException : public Exception {
+public:
+    DUCKDB_API InterruptException();
+};
+
+class FatalException : public Exception {
+public:
+    DUCKDB_API explicit FatalException(const string& msg)
+        : FatalException(ExceptionType::FATAL, msg) {}
+    template <typename... Args>
+    explicit FatalException(const string& msg, Args... params)
+        : FatalException(ConstructMessage(msg, params...)) {}
+
+protected:
+    DUCKDB_API explicit FatalException(ExceptionType type, const string& msg);
+    template <typename... Args>
+    explicit FatalException(ExceptionType type, const string& msg, Args... params)
+        : FatalException(type, ConstructMessage(msg, params...)) {}
+};
+
+class InternalException : public FatalException {
+public:
+    DUCKDB_API explicit InternalException(const string& msg);
+
+    template <typename... Args>
+    explicit InternalException(const string& msg, Args... params)
+        : InternalException(ConstructMessage(msg, params...)) {}
+};
+
+class InvalidInputException : public Exception {
+public:
+    DUCKDB_API explicit InvalidInputException(const string& msg);
+
+    template <typename... Args>
+    explicit InvalidInputException(const string& msg, Args... params)
+        : InvalidInputException(ConstructMessage(msg, params...)) {}
+};
+
+class CastException : public Exception {
+public:
+    DUCKDB_API CastException(const PhysicalType origType, const PhysicalType newType);
+    DUCKDB_API CastException(const LogicalType& origType, const LogicalType& newType);
+    DUCKDB_API
+    CastException(const string& msg); //! Needed to be able to recreate the exception after it's been serialized
+};
+
+class InvalidTypeException : public Exception {
+public:
+    DUCKDB_API InvalidTypeException(PhysicalType type, const string& msg);
+    DUCKDB_API InvalidTypeException(const LogicalType& type, const string& msg);
+    DUCKDB_API
+    InvalidTypeException(const string& msg); //! Needed to be able to recreate the exception after it's been serialized
+};
+
+class TypeMismatchException : public Exception {
+public:
+    DUCKDB_API TypeMismatchException(const PhysicalType type_1, const PhysicalType type_2, const string& msg);
+    DUCKDB_API TypeMismatchException(const LogicalType& type_1, const LogicalType& type_2, const string& msg);
+    DUCKDB_API
+    TypeMismatchException(const string& msg); //! Needed to be able to recreate the exception after it's been serialized
+};
+
+class ValueOutOfRangeException : public Exception {
+public:
+    DUCKDB_API ValueOutOfRangeException(const int64_t value, const PhysicalType origType, const PhysicalType newType);
+    DUCKDB_API ValueOutOfRangeException(const hugeint_t value, const PhysicalType origType, const PhysicalType newType);
+    DUCKDB_API ValueOutOfRangeException(const double value, const PhysicalType origType, const PhysicalType newType);
+    DUCKDB_API ValueOutOfRangeException(const PhysicalType varType, const idx_t length);
+    DUCKDB_API ValueOutOfRangeException(
+        const string& msg); //! Needed to be able to recreate the exception after it's been serialized
+};
+
+class ParameterNotAllowedException : public StandardException {
+public:
+    DUCKDB_API explicit ParameterNotAllowedException(const string& msg);
+
+    template <typename... Args>
+    explicit ParameterNotAllowedException(const string& msg, Args... params)
+        : ParameterNotAllowedException(ConstructMessage(msg, params...)) {}
+};
+
+//! Special exception that should be thrown in the binder if parameter types could not be resolved
+//! This will cause prepared statements to be forcibly rebound with the actual parameter values
+//! This exception is fatal if thrown outside of the binder (i.e. it should never be thrown outside of the binder)
+class ParameterNotResolvedException : public Exception {
+public:
+    DUCKDB_API explicit ParameterNotResolvedException();
+};
+
+} // namespace alp_bench
diff --git a/benchmarks/include/duckdb/exception_format_value.hpp b/benchmarks/include/duckdb/exception_format_value.hpp
new file mode 100644
index 0000000..fa70bbb
--- /dev/null
+++ b/benchmarks/include/duckdb/exception_format_value.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include "common.hpp"
+#include "types.hpp"
+
+namespace alp_bench {
+
+enum class ExceptionFormatValueType : uint8_t {
+    FORMAT_VALUE_TYPE_DOUBLE,
+    FORMAT_VALUE_TYPE_INTEGER,
+    FORMAT_VALUE_TYPE_STRING
+};
+
+struct ExceptionFormatValue {
+    DUCKDB_API ExceptionFormatValue(double dbl_val);   // NOLINT
+    DUCKDB_API ExceptionFormatValue(int64_t int_val);  // NOLINT
+    DUCKDB_API ExceptionFormatValue(string str_val);   // NOLINT
+    DUCKDB_API ExceptionFormatValue(hugeint_t hg_val); // NOLINT
+
+    ExceptionFormatValueType type;
+
+    double dbl_val = 0;
+    int64_t int_val = 0;
+    string str_val;
+
+public:
+    template <class T>
+    static ExceptionFormatValue CreateFormatValue(T value) {
+        return int64_t(value);
+    }
+    static string Format(const string& msg, vector<ExceptionFormatValue>& values);
+};
+
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(PhysicalType value);
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(LogicalType value);
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(float value);
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(double value);
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(string value);
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(const char* value);
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(char* value);
+template <>
+DUCKDB_API ExceptionFormatValue ExceptionFormatValue::CreateFormatValue(hugeint_t value);
+
+} // namespace alp_bench
diff --git a/benchmarks/include/duckdb/fast_mem.hpp b/benchmarks/include/duckdb/fast_mem.hpp
new file mode 100644
index 0000000..1ed2e84
--- /dev/null
+++ b/benchmarks/include/duckdb/fast_mem.hpp
@@ -0,0 +1,686 @@
+#pragma once
+
+#include "common.hpp"
+#include "types.hpp"
+
+template <size_t SIZE>
+static inline void MemcpyFixed(void* dest, const void* src) {
+    memcpy(dest, src, SIZE);
+}
+
+template <size_t SIZE>
+static inline int MemcmpFixed(const void* str1, const void* str2) {
+    return memcmp(str1, str2, SIZE);
+}
+
+namespace alp_bench {
+
+//! This templated memcpy is significantly faster than std::memcpy,
+//! but only when you are calling memcpy with a const size in a loop.
+//! For instance `while (<loop condition>) { memcpy(<dest>, <src>, const_size); ...
}` +static inline void FastMemcpy(void* dest, const void* src, const size_t size) { + // LCOV_EXCL_START + switch (size) { + case 0: + return; + case 1: + return MemcpyFixed<1>(dest, src); + case 2: + return MemcpyFixed<2>(dest, src); + case 3: + return MemcpyFixed<3>(dest, src); + case 4: + return MemcpyFixed<4>(dest, src); + case 5: + return MemcpyFixed<5>(dest, src); + case 6: + return MemcpyFixed<6>(dest, src); + case 7: + return MemcpyFixed<7>(dest, src); + case 8: + return MemcpyFixed<8>(dest, src); + case 9: + return MemcpyFixed<9>(dest, src); + case 10: + return MemcpyFixed<10>(dest, src); + case 11: + return MemcpyFixed<11>(dest, src); + case 12: + return MemcpyFixed<12>(dest, src); + case 13: + return MemcpyFixed<13>(dest, src); + case 14: + return MemcpyFixed<14>(dest, src); + case 15: + return MemcpyFixed<15>(dest, src); + case 16: + return MemcpyFixed<16>(dest, src); + case 17: + return MemcpyFixed<17>(dest, src); + case 18: + return MemcpyFixed<18>(dest, src); + case 19: + return MemcpyFixed<19>(dest, src); + case 20: + return MemcpyFixed<20>(dest, src); + case 21: + return MemcpyFixed<21>(dest, src); + case 22: + return MemcpyFixed<22>(dest, src); + case 23: + return MemcpyFixed<23>(dest, src); + case 24: + return MemcpyFixed<24>(dest, src); + case 25: + return MemcpyFixed<25>(dest, src); + case 26: + return MemcpyFixed<26>(dest, src); + case 27: + return MemcpyFixed<27>(dest, src); + case 28: + return MemcpyFixed<28>(dest, src); + case 29: + return MemcpyFixed<29>(dest, src); + case 30: + return MemcpyFixed<30>(dest, src); + case 31: + return MemcpyFixed<31>(dest, src); + case 32: + return MemcpyFixed<32>(dest, src); + case 33: + return MemcpyFixed<33>(dest, src); + case 34: + return MemcpyFixed<34>(dest, src); + case 35: + return MemcpyFixed<35>(dest, src); + case 36: + return MemcpyFixed<36>(dest, src); + case 37: + return MemcpyFixed<37>(dest, src); + case 38: + return MemcpyFixed<38>(dest, src); + case 39: + return MemcpyFixed<39>(dest, src); + case 40: + return MemcpyFixed<40>(dest, src); + case 41: + return MemcpyFixed<41>(dest, src); + case 42: + return MemcpyFixed<42>(dest, src); + case 43: + return MemcpyFixed<43>(dest, src); + case 44: + return MemcpyFixed<44>(dest, src); + case 45: + return MemcpyFixed<45>(dest, src); + case 46: + return MemcpyFixed<46>(dest, src); + case 47: + return MemcpyFixed<47>(dest, src); + case 48: + return MemcpyFixed<48>(dest, src); + case 49: + return MemcpyFixed<49>(dest, src); + case 50: + return MemcpyFixed<50>(dest, src); + case 51: + return MemcpyFixed<51>(dest, src); + case 52: + return MemcpyFixed<52>(dest, src); + case 53: + return MemcpyFixed<53>(dest, src); + case 54: + return MemcpyFixed<54>(dest, src); + case 55: + return MemcpyFixed<55>(dest, src); + case 56: + return MemcpyFixed<56>(dest, src); + case 57: + return MemcpyFixed<57>(dest, src); + case 58: + return MemcpyFixed<58>(dest, src); + case 59: + return MemcpyFixed<59>(dest, src); + case 60: + return MemcpyFixed<60>(dest, src); + case 61: + return MemcpyFixed<61>(dest, src); + case 62: + return MemcpyFixed<62>(dest, src); + case 63: + return MemcpyFixed<63>(dest, src); + case 64: + return MemcpyFixed<64>(dest, src); + case 65: + return MemcpyFixed<65>(dest, src); + case 66: + return MemcpyFixed<66>(dest, src); + case 67: + return MemcpyFixed<67>(dest, src); + case 68: + return MemcpyFixed<68>(dest, src); + case 69: + return MemcpyFixed<69>(dest, src); + case 70: + return MemcpyFixed<70>(dest, src); + case 71: + return MemcpyFixed<71>(dest, src); + case 72: + return 
MemcpyFixed<72>(dest, src); + case 73: + return MemcpyFixed<73>(dest, src); + case 74: + return MemcpyFixed<74>(dest, src); + case 75: + return MemcpyFixed<75>(dest, src); + case 76: + return MemcpyFixed<76>(dest, src); + case 77: + return MemcpyFixed<77>(dest, src); + case 78: + return MemcpyFixed<78>(dest, src); + case 79: + return MemcpyFixed<79>(dest, src); + case 80: + return MemcpyFixed<80>(dest, src); + case 81: + return MemcpyFixed<81>(dest, src); + case 82: + return MemcpyFixed<82>(dest, src); + case 83: + return MemcpyFixed<83>(dest, src); + case 84: + return MemcpyFixed<84>(dest, src); + case 85: + return MemcpyFixed<85>(dest, src); + case 86: + return MemcpyFixed<86>(dest, src); + case 87: + return MemcpyFixed<87>(dest, src); + case 88: + return MemcpyFixed<88>(dest, src); + case 89: + return MemcpyFixed<89>(dest, src); + case 90: + return MemcpyFixed<90>(dest, src); + case 91: + return MemcpyFixed<91>(dest, src); + case 92: + return MemcpyFixed<92>(dest, src); + case 93: + return MemcpyFixed<93>(dest, src); + case 94: + return MemcpyFixed<94>(dest, src); + case 95: + return MemcpyFixed<95>(dest, src); + case 96: + return MemcpyFixed<96>(dest, src); + case 97: + return MemcpyFixed<97>(dest, src); + case 98: + return MemcpyFixed<98>(dest, src); + case 99: + return MemcpyFixed<99>(dest, src); + case 100: + return MemcpyFixed<100>(dest, src); + case 101: + return MemcpyFixed<101>(dest, src); + case 102: + return MemcpyFixed<102>(dest, src); + case 103: + return MemcpyFixed<103>(dest, src); + case 104: + return MemcpyFixed<104>(dest, src); + case 105: + return MemcpyFixed<105>(dest, src); + case 106: + return MemcpyFixed<106>(dest, src); + case 107: + return MemcpyFixed<107>(dest, src); + case 108: + return MemcpyFixed<108>(dest, src); + case 109: + return MemcpyFixed<109>(dest, src); + case 110: + return MemcpyFixed<110>(dest, src); + case 111: + return MemcpyFixed<111>(dest, src); + case 112: + return MemcpyFixed<112>(dest, src); + case 113: + return MemcpyFixed<113>(dest, src); + case 114: + return MemcpyFixed<114>(dest, src); + case 115: + return MemcpyFixed<115>(dest, src); + case 116: + return MemcpyFixed<116>(dest, src); + case 117: + return MemcpyFixed<117>(dest, src); + case 118: + return MemcpyFixed<118>(dest, src); + case 119: + return MemcpyFixed<119>(dest, src); + case 120: + return MemcpyFixed<120>(dest, src); + case 121: + return MemcpyFixed<121>(dest, src); + case 122: + return MemcpyFixed<122>(dest, src); + case 123: + return MemcpyFixed<123>(dest, src); + case 124: + return MemcpyFixed<124>(dest, src); + case 125: + return MemcpyFixed<125>(dest, src); + case 126: + return MemcpyFixed<126>(dest, src); + case 127: + return MemcpyFixed<127>(dest, src); + case 128: + return MemcpyFixed<128>(dest, src); + case 129: + return MemcpyFixed<129>(dest, src); + case 130: + return MemcpyFixed<130>(dest, src); + case 131: + return MemcpyFixed<131>(dest, src); + case 132: + return MemcpyFixed<132>(dest, src); + case 133: + return MemcpyFixed<133>(dest, src); + case 134: + return MemcpyFixed<134>(dest, src); + case 135: + return MemcpyFixed<135>(dest, src); + case 136: + return MemcpyFixed<136>(dest, src); + case 137: + return MemcpyFixed<137>(dest, src); + case 138: + return MemcpyFixed<138>(dest, src); + case 139: + return MemcpyFixed<139>(dest, src); + case 140: + return MemcpyFixed<140>(dest, src); + case 141: + return MemcpyFixed<141>(dest, src); + case 142: + return MemcpyFixed<142>(dest, src); + case 143: + return MemcpyFixed<143>(dest, src); + case 144: + return 
MemcpyFixed<144>(dest, src); + case 145: + return MemcpyFixed<145>(dest, src); + case 146: + return MemcpyFixed<146>(dest, src); + case 147: + return MemcpyFixed<147>(dest, src); + case 148: + return MemcpyFixed<148>(dest, src); + case 149: + return MemcpyFixed<149>(dest, src); + case 150: + return MemcpyFixed<150>(dest, src); + case 151: + return MemcpyFixed<151>(dest, src); + case 152: + return MemcpyFixed<152>(dest, src); + case 153: + return MemcpyFixed<153>(dest, src); + case 154: + return MemcpyFixed<154>(dest, src); + case 155: + return MemcpyFixed<155>(dest, src); + case 156: + return MemcpyFixed<156>(dest, src); + case 157: + return MemcpyFixed<157>(dest, src); + case 158: + return MemcpyFixed<158>(dest, src); + case 159: + return MemcpyFixed<159>(dest, src); + case 160: + return MemcpyFixed<160>(dest, src); + case 161: + return MemcpyFixed<161>(dest, src); + case 162: + return MemcpyFixed<162>(dest, src); + case 163: + return MemcpyFixed<163>(dest, src); + case 164: + return MemcpyFixed<164>(dest, src); + case 165: + return MemcpyFixed<165>(dest, src); + case 166: + return MemcpyFixed<166>(dest, src); + case 167: + return MemcpyFixed<167>(dest, src); + case 168: + return MemcpyFixed<168>(dest, src); + case 169: + return MemcpyFixed<169>(dest, src); + case 170: + return MemcpyFixed<170>(dest, src); + case 171: + return MemcpyFixed<171>(dest, src); + case 172: + return MemcpyFixed<172>(dest, src); + case 173: + return MemcpyFixed<173>(dest, src); + case 174: + return MemcpyFixed<174>(dest, src); + case 175: + return MemcpyFixed<175>(dest, src); + case 176: + return MemcpyFixed<176>(dest, src); + case 177: + return MemcpyFixed<177>(dest, src); + case 178: + return MemcpyFixed<178>(dest, src); + case 179: + return MemcpyFixed<179>(dest, src); + case 180: + return MemcpyFixed<180>(dest, src); + case 181: + return MemcpyFixed<181>(dest, src); + case 182: + return MemcpyFixed<182>(dest, src); + case 183: + return MemcpyFixed<183>(dest, src); + case 184: + return MemcpyFixed<184>(dest, src); + case 185: + return MemcpyFixed<185>(dest, src); + case 186: + return MemcpyFixed<186>(dest, src); + case 187: + return MemcpyFixed<187>(dest, src); + case 188: + return MemcpyFixed<188>(dest, src); + case 189: + return MemcpyFixed<189>(dest, src); + case 190: + return MemcpyFixed<190>(dest, src); + case 191: + return MemcpyFixed<191>(dest, src); + case 192: + return MemcpyFixed<192>(dest, src); + case 193: + return MemcpyFixed<193>(dest, src); + case 194: + return MemcpyFixed<194>(dest, src); + case 195: + return MemcpyFixed<195>(dest, src); + case 196: + return MemcpyFixed<196>(dest, src); + case 197: + return MemcpyFixed<197>(dest, src); + case 198: + return MemcpyFixed<198>(dest, src); + case 199: + return MemcpyFixed<199>(dest, src); + case 200: + return MemcpyFixed<200>(dest, src); + case 201: + return MemcpyFixed<201>(dest, src); + case 202: + return MemcpyFixed<202>(dest, src); + case 203: + return MemcpyFixed<203>(dest, src); + case 204: + return MemcpyFixed<204>(dest, src); + case 205: + return MemcpyFixed<205>(dest, src); + case 206: + return MemcpyFixed<206>(dest, src); + case 207: + return MemcpyFixed<207>(dest, src); + case 208: + return MemcpyFixed<208>(dest, src); + case 209: + return MemcpyFixed<209>(dest, src); + case 210: + return MemcpyFixed<210>(dest, src); + case 211: + return MemcpyFixed<211>(dest, src); + case 212: + return MemcpyFixed<212>(dest, src); + case 213: + return MemcpyFixed<213>(dest, src); + case 214: + return MemcpyFixed<214>(dest, src); + case 215: + return 
MemcpyFixed<215>(dest, src);
+	case 216:
+		return MemcpyFixed<216>(dest, src);
+	case 217:
+		return MemcpyFixed<217>(dest, src);
+	case 218:
+		return MemcpyFixed<218>(dest, src);
+	case 219:
+		return MemcpyFixed<219>(dest, src);
+	case 220:
+		return MemcpyFixed<220>(dest, src);
+	case 221:
+		return MemcpyFixed<221>(dest, src);
+	case 222:
+		return MemcpyFixed<222>(dest, src);
+	case 223:
+		return MemcpyFixed<223>(dest, src);
+	case 224:
+		return MemcpyFixed<224>(dest, src);
+	case 225:
+		return MemcpyFixed<225>(dest, src);
+	case 226:
+		return MemcpyFixed<226>(dest, src);
+	case 227:
+		return MemcpyFixed<227>(dest, src);
+	case 228:
+		return MemcpyFixed<228>(dest, src);
+	case 229:
+		return MemcpyFixed<229>(dest, src);
+	case 230:
+		return MemcpyFixed<230>(dest, src);
+	case 231:
+		return MemcpyFixed<231>(dest, src);
+	case 232:
+		return MemcpyFixed<232>(dest, src);
+	case 233:
+		return MemcpyFixed<233>(dest, src);
+	case 234:
+		return MemcpyFixed<234>(dest, src);
+	case 235:
+		return MemcpyFixed<235>(dest, src);
+	case 236:
+		return MemcpyFixed<236>(dest, src);
+	case 237:
+		return MemcpyFixed<237>(dest, src);
+	case 238:
+		return MemcpyFixed<238>(dest, src);
+	case 239:
+		return MemcpyFixed<239>(dest, src);
+	case 240:
+		return MemcpyFixed<240>(dest, src);
+	case 241:
+		return MemcpyFixed<241>(dest, src);
+	case 242:
+		return MemcpyFixed<242>(dest, src);
+	case 243:
+		return MemcpyFixed<243>(dest, src);
+	case 244:
+		return MemcpyFixed<244>(dest, src);
+	case 245:
+		return MemcpyFixed<245>(dest, src);
+	case 246:
+		return MemcpyFixed<246>(dest, src);
+	case 247:
+		return MemcpyFixed<247>(dest, src);
+	case 248:
+		return MemcpyFixed<248>(dest, src);
+	case 249:
+		return MemcpyFixed<249>(dest, src);
+	case 250:
+		return MemcpyFixed<250>(dest, src);
+	case 251:
+		return MemcpyFixed<251>(dest, src);
+	case 252:
+		return MemcpyFixed<252>(dest, src);
+	case 253:
+		return MemcpyFixed<253>(dest, src);
+	case 254:
+		return MemcpyFixed<254>(dest, src);
+	case 255:
+		return MemcpyFixed<255>(dest, src);
+	case 256:
+		return MemcpyFixed<256>(dest, src);
+	default:
+		memcpy(dest, src, size);
+	}
+	// LCOV_EXCL_STOP
+}
+
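For context: `MemcpyFixed` is defined earlier in this header, outside this hunk, and simply forwards to `memcpy` with a size that is a template constant. Because the length is known at compile time, the optimizer replaces each call with a handful of fixed-width loads and stores instead of a call into libc; the switch above converts a runtime size into one of those specializations at the cost of a single, highly predictable jump. A minimal sketch consistent with the calls above:

```cpp
#include <cstddef>
#include <cstring>

// Hypothetical reconstruction (the real definition sits in the elided part of
// fast_mem.hpp): a copy whose size is a compile-time constant, so e.g.
// MemcpyFixed<32> lowers to two 16-byte vector moves on x86-64.
template <size_t SIZE>
static inline void MemcpyFixed(void* dest, const void* src) {
	memcpy(dest, src, SIZE);
}
```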
+//! This templated memcmp is significantly faster than std::memcmp,
+//! but only when you are calling memcmp with a const size in a loop.
+//! For instance `while (<cond>) { memcmp(<str1>, <str2>, const_size); ... }`
+static inline int FastMemcmp(const void* str1, const void* str2, const size_t size) {
+	// LCOV_EXCL_START
+	switch (size) {
+	case 0:
+		return 0;
+	case 1:
+		return MemcmpFixed<1>(str1, str2);
+	case 2:
+		return MemcmpFixed<2>(str1, str2);
+	case 3:
+		return MemcmpFixed<3>(str1, str2);
+	case 4:
+		return MemcmpFixed<4>(str1, str2);
+	case 5:
+		return MemcmpFixed<5>(str1, str2);
+	case 6:
+		return MemcmpFixed<6>(str1, str2);
+	case 7:
+		return MemcmpFixed<7>(str1, str2);
+	case 8:
+		return MemcmpFixed<8>(str1, str2);
+	case 9:
+		return MemcmpFixed<9>(str1, str2);
+	case 10:
+		return MemcmpFixed<10>(str1, str2);
+	case 11:
+		return MemcmpFixed<11>(str1, str2);
+	case 12:
+		return MemcmpFixed<12>(str1, str2);
+	case 13:
+		return MemcmpFixed<13>(str1, str2);
+	case 14:
+		return MemcmpFixed<14>(str1, str2);
+	case 15:
+		return MemcmpFixed<15>(str1, str2);
+	case 16:
+		return MemcmpFixed<16>(str1, str2);
+	case 17:
+		return MemcmpFixed<17>(str1, str2);
+	case 18:
+		return MemcmpFixed<18>(str1, str2);
+	case 19:
+		return MemcmpFixed<19>(str1, str2);
+	case 20:
+		return MemcmpFixed<20>(str1, str2);
+	case 21:
+		return MemcmpFixed<21>(str1, str2);
+	case 22:
+		return MemcmpFixed<22>(str1, str2);
+	case 23:
+		return MemcmpFixed<23>(str1, str2);
+	case 24:
+		return MemcmpFixed<24>(str1, str2);
+	case 25:
+		return MemcmpFixed<25>(str1, str2);
+	case 26:
+		return MemcmpFixed<26>(str1, str2);
+	case 27:
+		return MemcmpFixed<27>(str1, str2);
+	case 28:
+		return MemcmpFixed<28>(str1, str2);
+	case 29:
+		return MemcmpFixed<29>(str1, str2);
+	case 30:
+		return MemcmpFixed<30>(str1, str2);
+	case 31:
+		return MemcmpFixed<31>(str1, str2);
+	case 32:
+		return MemcmpFixed<32>(str1, str2);
+	case 33:
+		return MemcmpFixed<33>(str1, str2);
+	case 34:
+		return MemcmpFixed<34>(str1, str2);
+	case 35:
+		return MemcmpFixed<35>(str1, str2);
+	case 36:
+		return MemcmpFixed<36>(str1, str2);
+	case 37:
+		return MemcmpFixed<37>(str1, str2);
+	case 38:
+		return MemcmpFixed<38>(str1, str2);
+	case 39:
+		return MemcmpFixed<39>(str1, str2);
+	case 40:
+		return MemcmpFixed<40>(str1, str2);
+	case 41:
+		return MemcmpFixed<41>(str1, str2);
+	case 42:
+		return MemcmpFixed<42>(str1, str2);
+	case 43:
+		return MemcmpFixed<43>(str1, str2);
+	case 44:
+		return MemcmpFixed<44>(str1, str2);
+	case 45:
+		return MemcmpFixed<45>(str1, str2);
+	case 46:
+		return MemcmpFixed<46>(str1, str2);
+	case 47:
+		return MemcmpFixed<47>(str1, str2);
+	case 48:
+		return MemcmpFixed<48>(str1, str2);
+	case 49:
+		return MemcmpFixed<49>(str1, str2);
+	case 50:
+		return MemcmpFixed<50>(str1, str2);
+	case 51:
+		return MemcmpFixed<51>(str1, str2);
+	case 52:
+		return MemcmpFixed<52>(str1, str2);
+	case 53:
+		return MemcmpFixed<53>(str1, str2);
+	case 54:
+		return MemcmpFixed<54>(str1, str2);
+	case 55:
+		return MemcmpFixed<55>(str1, str2);
+	case 56:
+		return MemcmpFixed<56>(str1, str2);
+	case 57:
+		return MemcmpFixed<57>(str1, str2);
+	case 58:
+		return MemcmpFixed<58>(str1, str2);
+	case 59:
+		return MemcmpFixed<59>(str1, str2);
+	case 60:
+		return MemcmpFixed<60>(str1, str2);
+	case 61:
+		return MemcmpFixed<61>(str1, str2);
+	case 62:
+		return MemcmpFixed<62>(str1, str2);
+	case 63:
+		return MemcmpFixed<63>(str1, str2);
+	case 64:
+		return MemcmpFixed<64>(str1, str2);
+	default:
+		return memcmp(str1, str2, size);
+	}
+	// LCOV_EXCL_STOP
+}
+
+} // namespace alp_bench
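As the comment above notes, the dispatch only pays off when the call site feeds a loop-invariant size. A hedged usage sketch (the container layout is illustrative, not from the patch):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged sketch: probing fixed-width (16-byte) keys. The size reaching
// FastMemcmp is the literal 16 on every iteration, so the switch branch
// predicts perfectly and each comparison inlines to two 8-byte compares.
static size_t CountMatches(const std::vector<std::array<uint8_t, 16>>& keys,
                           const uint8_t* needle) {
	size_t matches = 0;
	for (const auto& key : keys) {
		if (alp_bench::FastMemcmp(key.data(), needle, 16) == 0) { matches++; }
	}
	return matches;
}
```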
diff --git a/benchmarks/include/duckdb/helper.hpp b/benchmarks/include/duckdb/helper.hpp
new file mode 100644
index 0000000..9947fbc
--- /dev/null
+++ b/benchmarks/include/duckdb/helper.hpp
@@ -0,0 +1,124 @@
+#pragma once
+
+#include "constants.hpp"
+#include <string.h>
+
+#ifdef _MSC_VER
+#define suint64_t int64_t
+#endif
+
+#if defined(_WIN32) || defined(_WIN64)
+#define DUCKDB_WINDOWS
+#elif defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__))
+#define DUCKDB_POSIX
+#endif
+
+namespace alp_bench {
+
+// explicit fallthrough for switch statements
+#ifndef __has_cpp_attribute // For backwards compatibility
+#define __has_cpp_attribute(x) 0
+#endif
+#if __has_cpp_attribute(clang::fallthrough)
+#define DUCKDB_EXPLICIT_FALLTHROUGH [[clang::fallthrough]]
+#elif __has_cpp_attribute(gnu::fallthrough)
+#define DUCKDB_EXPLICIT_FALLTHROUGH [[gnu::fallthrough]]
+#else
+#define DUCKDB_EXPLICIT_FALLTHROUGH
+#endif
+
+#if !defined(_MSC_VER) && (__cplusplus < 201402L)
+template <typename T, typename... Args>
+unique_ptr<T> make_unique(Args&&... args) {
+	return unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+#else // Visual Studio has make_unique
+using std::make_unique;
+#endif
+template <typename S, typename T, typename... Args>
+unique_ptr<S> make_unique_base(Args&&... args) {
+	return unique_ptr<S>(new T(std::forward<Args>(args)...));
+}
+
+template <typename T, typename S>
+unique_ptr<S> unique_ptr_cast(unique_ptr<T> src) {
+	return unique_ptr<S>(static_cast<S*>(src.release()));
+}
+
+struct SharedConstructor {
+	template <class T, typename... ARGS>
+	static shared_ptr<T> Create(ARGS&&... args) {
+		return make_shared<T>(std::forward<ARGS>(args)...);
+	}
+};
+
+struct UniqueConstructor {
+	template <class T, typename... ARGS>
+	static unique_ptr<T> Create(ARGS&&... args) {
+		return make_unique<T>(std::forward<ARGS>(args)...);
+	}
+};
+
+#ifdef DUCKDB_DEBUG_MOVE
+template <class T>
+typename std::remove_reference<T>::type&& move(T&& t) noexcept {
+	// the nonsensical sizeof check ensures this is never instantiated
+	static_assert(sizeof(T) == 0, "Use std::move instead of unqualified move or duckdb::move");
+}
+#endif
+
+template <class T>
+T MaxValue(T a, T b) {
+	return a > b ? a : b;
+}
+
+template <class T>
+T MinValue(T a, T b) {
+	return a < b ? a : b;
+}
+
+template <class T>
+T AbsValue(T a) {
+	return a < 0 ? -a : a;
+}
+
+// Align value (ceiling)
+template <class T, T val = 8>
+static inline T AlignValue(T n) {
+	return ((n + (val - 1)) / val) * val;
+}
+
+template <class T, T val = 8>
+static inline bool ValueIsAligned(T n) {
+	return (n % val) == 0;
+}
+
+template <class T>
+T SignValue(T a) {
+	return a < 0 ? -1 : 1;
+}
+
+template <class T>
+const T Load(const_data_ptr_t ptr) {
+	T ret;
+	memcpy(&ret, ptr, sizeof(ret));
+	return ret;
+}
+
+template <class T>
+void Store(const T val, data_ptr_t ptr) {
+	memcpy(ptr, (void*)&val, sizeof(val));
+}
+
+//! This assigns a shared pointer, but ONLY assigns if "target" is not equal to "source"
+//! If this is often the case, this manner of assignment is significantly faster (~20X faster)
+//! Since it avoids the need of an atomic incref/decref at the cost of a single pointer comparison
+//! Benchmark: https://gist.github.com/Mytherin/4db3faa8e233c4a9b874b21f62bb4b96
+//! If the shared pointers are not the same, the penalty is very low (on the order of ~1% slower)
+//! This method should always be preferred if there is a (reasonable) chance that the pointers are the same
+template <class T>
+void AssignSharedPointer(shared_ptr<T>& target, const shared_ptr<T>& source) {
+	if (target.get() != source.get()) { target = source; }
+}
+
+} // namespace alp_bench
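`Load` and `Store` are the portable unaligned accessors used throughout these benchmarks: routing through `memcpy` sidesteps the undefined behaviour of casting a misaligned byte pointer to `T*`, and compilers lower the fixed-size `memcpy` to a single move. A small round trip, assuming `data_ptr_t`/`const_data_ptr_t` are the byte-pointer aliases from constants.hpp:

```cpp
#include <cstdint>

// Hedged sketch: write a uint32_t at a deliberately misaligned offset and read
// it back; both operations are well-defined because they go through memcpy.
void RoundTrip() {
	uint8_t buffer[8] = {};
	alp_bench::Store<uint32_t>(0xDEADBEEFu, buffer + 1);
	uint32_t v = alp_bench::Load<uint32_t>(buffer + 1); // v == 0xDEADBEEF
	(void)v;
}
```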
diff --git a/benchmarks/include/duckdb/likely.hpp b/benchmarks/include/duckdb/likely.hpp
new file mode 100644
index 0000000..0a4da53
--- /dev/null
+++ b/benchmarks/include/duckdb/likely.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#if __GNUC__
+#define DUCKDB_BUILTIN_EXPECT(cond, expected_value) (__builtin_expect(cond, expected_value))
+#else
+#define DUCKDB_BUILTIN_EXPECT(cond, expected_value) (cond)
+#endif
+
+#define DUCKDB_LIKELY(...) DUCKDB_BUILTIN_EXPECT((__VA_ARGS__), 1)
+#define DUCKDB_UNLIKELY(...) DUCKDB_BUILTIN_EXPECT((__VA_ARGS__), 0)
diff --git a/benchmarks/include/duckdb/limits.hpp b/benchmarks/include/duckdb/limits.hpp
new file mode 100644
index 0000000..7e98635
--- /dev/null
+++ b/benchmarks/include/duckdb/limits.hpp
@@ -0,0 +1,100 @@
+#pragma once
+
+#include "types.hpp"
+
+// Undef annoying windows macro
+#undef max
+
+#include <limits>
+
+namespace alp_bench {
+
+template <class T>
+struct NumericLimits {
+	DUCKDB_API static constexpr T Minimum() { return std::numeric_limits<T>::lowest(); };
+	DUCKDB_API static constexpr T Maximum() { return std::numeric_limits<T>::max(); };
+	DUCKDB_API static bool IsSigned();
+	DUCKDB_API static idx_t Digits();
+};
+
+template <>
+struct NumericLimits<int8_t> {
+	DUCKDB_API static constexpr int8_t Minimum() { return std::numeric_limits<int8_t>::lowest(); };
+	DUCKDB_API static constexpr int8_t Maximum() { return std::numeric_limits<int8_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return true; }
+	DUCKDB_API static idx_t Digits() { return 3; }
+};
+template <>
+struct NumericLimits<int16_t> {
+	DUCKDB_API static constexpr int16_t Minimum() { return std::numeric_limits<int16_t>::lowest(); };
+	DUCKDB_API static constexpr int16_t Maximum() { return std::numeric_limits<int16_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return true; }
+	DUCKDB_API static idx_t Digits() { return 5; }
+};
+template <>
+struct NumericLimits<int32_t> {
+	DUCKDB_API static constexpr int32_t Minimum() { return std::numeric_limits<int32_t>::lowest(); };
+	DUCKDB_API static constexpr int32_t Maximum() { return std::numeric_limits<int32_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return true; }
+	DUCKDB_API static idx_t Digits() { return 10; }
+};
+template <>
+struct NumericLimits<int64_t> {
+	DUCKDB_API static constexpr int64_t Minimum() { return std::numeric_limits<int64_t>::lowest(); };
+	DUCKDB_API static constexpr int64_t Maximum() { return std::numeric_limits<int64_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return true; }
+	DUCKDB_API static idx_t Digits() { return 19; }
+};
+template <>
+struct NumericLimits<hugeint_t> {
+	DUCKDB_API static constexpr hugeint_t Minimum() { return {std::numeric_limits<int64_t>::lowest(), 1}; };
+	DUCKDB_API static constexpr hugeint_t Maximum() {
+		return {std::numeric_limits<int64_t>::max(), std::numeric_limits<uint64_t>::max()};
+	};
+	DUCKDB_API static bool IsSigned() { return true; }
+	DUCKDB_API static idx_t Digits() { return 39; }
+};
+template <>
+struct NumericLimits<uint8_t> {
+	DUCKDB_API static constexpr uint8_t Minimum() { return std::numeric_limits<uint8_t>::lowest(); };
+	DUCKDB_API static constexpr uint8_t Maximum() { return std::numeric_limits<uint8_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return false; }
+	DUCKDB_API static idx_t Digits() { return 3; }
+};
+template <>
+struct NumericLimits<uint16_t> {
+	DUCKDB_API static constexpr uint16_t Minimum() { return std::numeric_limits<uint16_t>::lowest(); };
+	DUCKDB_API static constexpr uint16_t Maximum() { return std::numeric_limits<uint16_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return false; }
+	DUCKDB_API static idx_t Digits() { return 5; }
+};
+template <>
+struct NumericLimits<uint32_t> {
+	DUCKDB_API static constexpr uint32_t Minimum() { return std::numeric_limits<uint32_t>::lowest(); };
+	DUCKDB_API static constexpr uint32_t Maximum() { return std::numeric_limits<uint32_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return false; }
+	DUCKDB_API static idx_t Digits() { return 10; }
+};
+template <>
+struct NumericLimits<uint64_t> {
+	DUCKDB_API static constexpr uint64_t Minimum() { return std::numeric_limits<uint64_t>::lowest(); };
+	DUCKDB_API static constexpr uint64_t Maximum() { return std::numeric_limits<uint64_t>::max(); };
+	DUCKDB_API static bool IsSigned() { return false; }
+	DUCKDB_API static idx_t Digits() { return 20; }
+};
+template <>
+struct NumericLimits<float> {
+	DUCKDB_API static constexpr float Minimum() { return std::numeric_limits<float>::lowest(); };
+	DUCKDB_API static constexpr float Maximum() { return std::numeric_limits<float>::max(); };
+	DUCKDB_API static bool IsSigned() { return true; }
+	DUCKDB_API static idx_t Digits() { return 127; }
+};
+template <>
+struct NumericLimits<double> {
+	DUCKDB_API static constexpr double Minimum() { return std::numeric_limits<double>::lowest(); };
+	DUCKDB_API static constexpr double Maximum() { return std::numeric_limits<double>::max(); };
+	DUCKDB_API static bool IsSigned() { return true; }
+	DUCKDB_API static idx_t Digits() { return 250; }
+};
+
+} // namespace alp_bench
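The only non-obvious specialization is `hugeint_t` (defined in types.hpp further down in this patch), whose bounds are composed from two 64-bit words via the `(upper, lower)` constructor, the value being read as upper * 2^64 + lower. Working that composition out:

```cpp
// Worked check of the composed hugeint_t bounds (value = upper * 2^64 + lower):
//
//   Maximum() = { upper = 2^63 - 1, lower = 2^64 - 1 }
//             = (2^63 - 1) * 2^64 + (2^64 - 1) = 2^127 - 1
//   Minimum() = { upper = -2^63,    lower = 1 }
//             = -2^63 * 2^64 + 1   = -(2^127 - 1)
//
// The range is deliberately symmetric: using lower = 1 (not 0) in Minimum()
// leaves the bit pattern for -2^127 unused, which DuckDB-derived code
// conventionally reserves as a sentinel. That last point is an inference from
// the constants above, not something stated in the patch.
```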
diff --git a/benchmarks/include/duckdb/single_thread_ptr.hpp b/benchmarks/include/duckdb/single_thread_ptr.hpp
new file mode 100644
index 0000000..2116d35
--- /dev/null
+++ b/benchmarks/include/duckdb/single_thread_ptr.hpp
@@ -0,0 +1,164 @@
+#pragma once
+
+class RefCounter {
+public:
+	uint32_t pn;
+	RefCounter()
+	    : pn(1) {}
+	void inc() { ++pn; }
+	void dec() { --pn; }
+	uint32_t getPn() const { return pn; }
+	virtual ~RefCounter() {}
+};
+
+namespace alp_bench {
+template <typename T>
+class single_thread_ptr {
+public:
+	T* ptr;                 // contained pointer
+	RefCounter* ref_count;  // reference counter
+
+public:
+	// Default constructor, constructs an empty single_thread_ptr.
+	constexpr single_thread_ptr()
+	    : ptr(nullptr)
+	    , ref_count(nullptr) {}
+	// Construct empty single_thread_ptr.
+	constexpr single_thread_ptr(std::nullptr_t)
+	    : ptr(nullptr)
+	    , ref_count(nullptr) {}
+	// Construct a single_thread_ptr that wraps raw pointer.
+
+	single_thread_ptr(RefCounter* r, T* p) {
+		ptr       = p;
+		ref_count = r;
+	}
+
+	template <typename U>
+	single_thread_ptr(RefCounter* r, U* p) {
+		ptr       = p;
+		ref_count = r;
+	}
+
+	// Copy constructor.
+	single_thread_ptr(const single_thread_ptr& sp)
+	    : ptr(nullptr)
+	    , ref_count(nullptr) {
+		if (sp.ptr) {
+			ptr       = sp.ptr;
+			ref_count = sp.ref_count;
+			ref_count->inc();
+		}
+	}
+
+	// Conversion constructor.
+	template <typename U>
+	single_thread_ptr(const single_thread_ptr<U>& sp)
+	    : ptr(nullptr)
+	    , ref_count(nullptr) {
+		if (sp.ptr) {
+			ptr       = sp.ptr;
+			ref_count = sp.ref_count;
+			ref_count->inc();
+		}
+	}
+
+	// move constructor.
+	single_thread_ptr(single_thread_ptr&& sp) noexcept
+	    : ptr {sp.ptr}
+	    , ref_count {sp.ref_count} {
+		sp.ptr       = nullptr;
+		sp.ref_count = nullptr;
+	}
+
+	// move constructor.
+	template <typename U>
+	single_thread_ptr(single_thread_ptr<U>&& sp) noexcept
+	    : ptr {sp.ptr}
+	    , ref_count {sp.ref_count} {
+		sp.ptr       = nullptr;
+		sp.ref_count = nullptr;
+	}
+
+	// No effect if single_thread_ptr is empty or use_count() > 1, otherwise release the resources.
+	~single_thread_ptr() { release(); }
+
+	void release() {
+		if (ptr && ref_count) {
+			ref_count->dec();
+			if ((ref_count->getPn()) == 0) { delete ref_count; }
+		}
+		ref_count = nullptr;
+		ptr       = nullptr;
+	}
+
+	// Copy assignment.
+	single_thread_ptr& operator=(single_thread_ptr sp) noexcept {
+		std::swap(this->ptr, sp.ptr);
+		std::swap(this->ref_count, sp.ref_count);
+		return *this;
+	}
+
+	// Dereference pointer to managed object.
+	T& operator*() const noexcept { return *ptr; }
+	T* operator->() const noexcept { return ptr; }
+
+	// Return the contained pointer.
+	T* get() const noexcept { return ptr; }
+
+	// Return use count (use count == 0 if single_thread_ptr is empty).
+	long use_count() const noexcept {
+		if (ptr)
+			return ref_count->getPn();
+		else
+			return 0;
+	}
+
+	// Check if there is an associated managed object.
+	explicit operator bool() const noexcept { return (ptr); }
+
+	// Resets single_thread_ptr to empty.
+	void reset() noexcept { release(); }
+};
+
+template <class T>
+struct _object_and_block : public RefCounter {
+	T object;
+
+	template <class... Args>
+	explicit _object_and_block(Args&&... args)
+	    : object(std::forward<Args>(args)...) {}
+};
+
+// Operator overloading.
+template <class T, class U>
+inline bool operator==(const single_thread_ptr<T>& sp1, const single_thread_ptr<U>& sp2) {
+	return sp1.get() == sp2.get();
+}
+
+template <class T>
+inline bool operator==(const single_thread_ptr<T>& sp, std::nullptr_t) noexcept {
+	return !sp;
+}
+
+template <class T, class U>
+inline bool operator!=(const single_thread_ptr<T>& sp1, const single_thread_ptr<U>& sp2) {
+	return sp1.get() != sp2.get();
+}
+
+template <class T>
+inline bool operator!=(const single_thread_ptr<T>& sp, std::nullptr_t) noexcept {
+	return sp.get();
+}
+
+template <class T>
+inline bool operator!=(std::nullptr_t, const single_thread_ptr<T>& sp) noexcept {
+	return sp.get();
+}
+
+template <class T, class... Args>
+single_thread_ptr<T> single_thread_make_shared(Args&&... args) {
+	auto tmp_object = new _object_and_block<T>(std::forward<Args>(args)...);
+	return single_thread_ptr<T>(tmp_object, &(tmp_object->object));
+}
+} // namespace alp_bench
\ No newline at end of file
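`single_thread_make_shared` mirrors `std::make_shared`: `_object_and_block` derives from `RefCounter` and embeds the `T`, so a single allocation carries both the counter and the object, and the `(RefCounter*, T*)` constructor then splits that allocation into its two roles. Because `pn` is a plain `uint32_t` with no atomics, this is only safe while every copy of the pointer stays on one thread. A usage sketch:

```cpp
#include <string>

void Example() {
	// One heap allocation holds the refcount and the string together.
	auto p = alp_bench::single_thread_make_shared<std::string>("alp");
	auto q = p;  // plain (non-atomic) increment: p.use_count() == 2
	q.reset();   // plain decrement: p.use_count() == 1
	// when p goes out of scope the count hits 0 and the block is deleted
}
```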
diff --git a/benchmarks/include/duckdb/string.hpp b/benchmarks/include/duckdb/string.hpp
new file mode 100644
index 0000000..5ddc479
--- /dev/null
+++ b/benchmarks/include/duckdb/string.hpp
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <sstream>
+#include <string>
+
+namespace alp_bench {
+using std::string;
+}
diff --git a/benchmarks/include/duckdb/to_string.hpp b/benchmarks/include/duckdb/to_string.hpp
new file mode 100644
index 0000000..89b3b87
--- /dev/null
+++ b/benchmarks/include/duckdb/to_string.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+namespace lwcbench {
+using std::to_string;
+}
diff --git a/benchmarks/include/duckdb/types.hpp b/benchmarks/include/duckdb/types.hpp
new file mode 100644
index 0000000..15d7181
--- /dev/null
+++ b/benchmarks/include/duckdb/types.hpp
@@ -0,0 +1,487 @@
+#pragma once
+
+#include "assert.hpp"
+#include "constants.hpp"
+#include "single_thread_ptr.hpp"
+#include "vector.hpp"
+#include <cstdint>
+
+namespace alp_bench {
+
+class Serializer;
+class Deserializer;
+class Value;
+class TypeCatalogEntry;
+class Vector;
+class ClientContext;
+
+struct hugeint_t {
+public:
+	uint64_t lower;
+	int64_t upper;
+
+public:
+	DUCKDB_API hugeint_t() = default;
+	DUCKDB_API hugeint_t(int64_t value); // NOLINT: Allow implicit conversion from `int64_t`
+	DUCKDB_API constexpr hugeint_t(int64_t upper, uint64_t lower)
+	    : lower(lower)
+	    , upper(upper) {};
+	DUCKDB_API constexpr hugeint_t(const hugeint_t& rhs) = default;
+	DUCKDB_API constexpr hugeint_t(hugeint_t&& rhs) = default;
+	DUCKDB_API hugeint_t& operator=(const hugeint_t& rhs) = default;
+	DUCKDB_API hugeint_t& operator=(hugeint_t&& rhs) = default;
+
+	DUCKDB_API string ToString() const;
+
+	// comparison operators
+	DUCKDB_API bool operator==(const hugeint_t& rhs) const;
+	DUCKDB_API bool operator!=(const hugeint_t& rhs) const;
+	DUCKDB_API bool operator<=(const hugeint_t& rhs) const;
+	DUCKDB_API bool operator<(const hugeint_t& rhs) const;
+	DUCKDB_API bool operator>(const hugeint_t& rhs) const;
+	DUCKDB_API bool operator>=(const hugeint_t& rhs) const;
+
+	// arithmetic operators
+	DUCKDB_API hugeint_t operator+(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator-(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator*(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator/(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator%(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator-() const;
+
+	// bitwise operators
+	DUCKDB_API hugeint_t operator>>(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator<<(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator&(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator|(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator^(const hugeint_t& rhs) const;
+	DUCKDB_API hugeint_t operator~() const;
+
+	// in-place operators
+	DUCKDB_API hugeint_t& operator+=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator-=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator*=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator/=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator%=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator>>=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator<<=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator&=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator|=(const hugeint_t& rhs);
+	DUCKDB_API hugeint_t& operator^=(const hugeint_t& rhs);
+};
+
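The `{upper, lower}` pair behaves like one 128-bit two's-complement integer whose bit pattern is `upper:lower`; the operators declared above all rely on that reading. A few spot values, derived from the representation rather than stated in the patch:

```cpp
// hugeint_t(upper, lower) represents upper * 2^64 + lower, e.g.:
//
//   hugeint_t(0, 0)                       ->  0
//   hugeint_t(0, 5)                       ->  5
//   hugeint_t(1, 0)                       ->  2^64
//   hugeint_t(-1, 0xFFFFFFFFFFFFFFFFULL)  -> -1   (all 128 bits set)
//
// This is exactly the two's-complement extension of int64 into 128 bits:
// negating a value flips all 128 bits and adds one, carrying from lower
// into upper like a native integer.
```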
+struct string_t;
+
+template <class T>
+using child_list_t = std::vector<std::pair<std::string, T>>;
+//! FIXME: this should be a single_thread_ptr
+template <class T>
+using buffer_ptr = shared_ptr<T>;
+
+template <class T, typename... Args>
+buffer_ptr<T> make_buffer(Args&&... args) {
+	return make_shared<T>(std::forward<Args>(args)...);
+}
+
+struct list_entry_t {
+	list_entry_t() = default;
+	list_entry_t(uint64_t offset, uint64_t length)
+	    : offset(offset)
+	    , length(length) {}
+
+	uint64_t offset;
+	uint64_t length;
+};
+
+using union_tag_t = uint8_t;
+
+//===--------------------------------------------------------------------===//
+// Internal Types
+//===--------------------------------------------------------------------===//
+
+// taken from arrow's type.h
+enum class PhysicalType : uint8_t {
+	///// A NULL type having no physical storage
+	// NA = 0,
+
+	/// Boolean as 8 bit "bool" value
+	BOOL = 1,
+
+	/// Unsigned 8-bit little-endian integer
+	UINT8 = 2,
+
+	/// Signed 8-bit little-endian integer
+	INT8 = 3,
+
+	/// Unsigned 16-bit little-endian integer
+	UINT16 = 4,
+
+	/// Signed 16-bit little-endian integer
+	INT16 = 5,
+
+	/// Unsigned 32-bit little-endian integer
+	UINT32 = 6,
+
+	/// Signed 32-bit little-endian integer
+	INT32 = 7,
+
+	/// Unsigned 64-bit little-endian integer
+	UINT64 = 8,
+
+	/// Signed 64-bit little-endian integer
+	INT64 = 9,
+
+	///// 2-byte floating point value
+	// HALF_FLOAT = 10,
+
+	/// 4-byte floating point value
+	FLOAT = 11,
+
+	/// 8-byte floating point value
+	DOUBLE = 12,
+
+	///// UTF8 variable-length string as List<char>
+	// STRING = 13,
+
+	///// Variable-length bytes (no guarantee of UTF8-ness)
+	// BINARY = 14,
+
+	///// Fixed-size binary. Each value occupies the same number of bytes
+	// FIXED_SIZE_BINARY = 15,
+
+	///// int32_t days since the UNIX epoch
+	// DATE32 = 16,
+
+	///// int64_t milliseconds since the UNIX epoch
+	// DATE64 = 17,
+
+	///// Exact timestamp encoded with int64 since UNIX epoch
+	///// Default unit millisecond
+	// TIMESTAMP = 18,
+
+	///// Time as signed 32-bit integer, representing either seconds or
+	///// milliseconds since midnight
+	// TIME32 = 19,
+
+	///// Time as signed 64-bit integer, representing either microseconds or
+	///// nanoseconds since midnight
+	// TIME64 = 20,
+
+	/// YEAR_MONTH or DAY_TIME interval in SQL style
+	INTERVAL = 21,
+
+	/// Precision- and scale-based decimal type. Storage type depends on the
+	/// parameters.
+	// DECIMAL = 22,
+
+	/// A list of some logical data type
+	LIST = 23,
+
+	/// Struct of logical types
+	STRUCT = 24,
+
+	///// Unions of logical types
+	// UNION = 25,
+
+	///// Dictionary-encoded type, also called "categorical" or "factor"
+	///// in other programming languages. Holds the dictionary value
+	///// type but not the dictionary itself, which is part of the
+	///// ArrayData struct
+	// DICTIONARY = 26,
+
+	///// Custom data type, implemented by user
+	// EXTENSION = 28,
+
+	///// Fixed size list of some logical type
+	// FIXED_SIZE_LIST = 29,
+
+	///// Measure of elapsed time in either seconds, milliseconds, microseconds
+	///// or nanoseconds.
+ // DURATION = 30, + + ///// Like STRING, but with 64-bit offsets + // LARGE_STRING = 31, + + ///// Like BINARY, but with 64-bit offsets + // LARGE_BINARY = 32, + + ///// Like LIST, but with 64-bit offsets + // LARGE_LIST = 33, + + /// DuckDB Extensions + VARCHAR = 200, // our own string representation, different from STRING and LARGE_STRING above + INT128 = 204, // 128-bit integers + UNKNOWN = 205, // Unknown physical type of user defined types + /// Boolean as 1 bit, LSB bit-packed ordering + BIT = 206, + + INVALID = 255 +}; + +//===--------------------------------------------------------------------===// +// SQL Types +//===--------------------------------------------------------------------===// +enum class LogicalTypeId : uint8_t { + INVALID = 0, + SQLNULL = 1, /* NULL type, used for constant NULL */ + UNKNOWN = 2, /* unknown type, used for parameter expressions */ + ANY = 3, /* ANY type, used for functions that accept any type as parameter */ + USER = 4, /* A User Defined Type (e.g., ENUMs before the binder) */ + BOOLEAN = 10, + TINYINT = 11, + SMALLINT = 12, + INTEGER = 13, + BIGINT = 14, + DATE = 15, + TIME = 16, + TIMESTAMP_SEC = 17, + TIMESTAMP_MS = 18, + TIMESTAMP = 19, //! us + TIMESTAMP_NS = 20, + DECIMAL = 21, + FLOAT = 22, + DOUBLE = 23, + CHAR = 24, + VARCHAR = 25, + BLOB = 26, + INTERVAL = 27, + UTINYINT = 28, + USMALLINT = 29, + UINTEGER = 30, + UBIGINT = 31, + TIMESTAMP_TZ = 32, + TIME_TZ = 34, + + HUGEINT = 50, + POINTER = 51, + // HASH = 52, // deprecated, uses UBIGINT instead + VALIDITY = 53, + UUID = 54, + + STRUCT = 100, + LIST = 101, + MAP = 102, + TABLE = 103, + ENUM = 104, + AGGREGATE_STATE = 105, + LAMBDA = 106, + UNION = 107 +}; + +struct ExtraTypeInfo; + +struct aggregate_state_t; + +struct LogicalType { + DUCKDB_API LogicalType(); + DUCKDB_API LogicalType(LogicalTypeId id); // NOLINT: Allow implicit conversion from `LogicalTypeId` + DUCKDB_API LogicalType(LogicalTypeId id, shared_ptr type_info); + DUCKDB_API LogicalType(const LogicalType& other); + DUCKDB_API LogicalType(LogicalType&& other) noexcept; + + DUCKDB_API ~LogicalType(); + + inline LogicalTypeId id() const { return id_; } + inline PhysicalType InternalType() const { return physical_type_; } + inline const ExtraTypeInfo* AuxInfo() const { return type_info_.get(); } + inline void CopyAuxInfo(const LogicalType& other) { type_info_ = other.type_info_; } + bool EqualTypeInfo(const LogicalType& rhs) const; + + // copy assignment + inline LogicalType& operator=(const LogicalType& other) { + id_ = other.id_; + physical_type_ = other.physical_type_; + type_info_ = other.type_info_; + return *this; + } + // move assignment + inline LogicalType& operator=(LogicalType&& other) noexcept { + id_ = other.id_; + physical_type_ = other.physical_type_; + type_info_ = std::move(other.type_info_); + return *this; + } + + DUCKDB_API bool operator==(const LogicalType& rhs) const; + inline bool operator!=(const LogicalType& rhs) const { return !(*this == rhs); } + + //! Serializes a LogicalType to a stand-alone binary blob + DUCKDB_API void Serialize(Serializer& serializer) const; + //! 
Deserializes a blob back into an LogicalType + DUCKDB_API static LogicalType Deserialize(Deserializer& source); + + DUCKDB_API static bool TypeIsTimestamp(LogicalTypeId id) { + return (id == LogicalTypeId::TIMESTAMP || id == LogicalTypeId::TIMESTAMP_MS || + id == LogicalTypeId::TIMESTAMP_NS || id == LogicalTypeId::TIMESTAMP_SEC || + id == LogicalTypeId::TIMESTAMP_TZ); + } + DUCKDB_API static bool TypeIsTimestamp(const LogicalType& type) { return TypeIsTimestamp(type.id()); } + DUCKDB_API string ToString() const; + DUCKDB_API bool IsIntegral() const; + DUCKDB_API bool IsNumeric() const; + DUCKDB_API hash_t Hash() const; + DUCKDB_API void SetAlias(string alias); + DUCKDB_API bool HasAlias() const; + DUCKDB_API string GetAlias() const; + + DUCKDB_API static LogicalType MaxLogicalType(const LogicalType& left, const LogicalType& right); + DUCKDB_API static void SetCatalog(LogicalType& type, TypeCatalogEntry* catalog_entry); + DUCKDB_API static TypeCatalogEntry* GetCatalog(const LogicalType& type); + + //! Gets the decimal properties of a numeric type. Fails if the type is not numeric. + DUCKDB_API bool GetDecimalProperties(uint8_t& width, uint8_t& scale) const; + + DUCKDB_API void Verify() const; + + DUCKDB_API bool IsValid() const; + +private: + LogicalTypeId id_; + PhysicalType physical_type_; + shared_ptr type_info_; + +private: + PhysicalType GetInternalType(); + +public: + static constexpr const LogicalTypeId SQLNULL = LogicalTypeId::SQLNULL; + static constexpr const LogicalTypeId UNKNOWN = LogicalTypeId::UNKNOWN; + static constexpr const LogicalTypeId BOOLEAN = LogicalTypeId::BOOLEAN; + static constexpr const LogicalTypeId TINYINT = LogicalTypeId::TINYINT; + static constexpr const LogicalTypeId UTINYINT = LogicalTypeId::UTINYINT; + static constexpr const LogicalTypeId SMALLINT = LogicalTypeId::SMALLINT; + static constexpr const LogicalTypeId USMALLINT = LogicalTypeId::USMALLINT; + static constexpr const LogicalTypeId INTEGER = LogicalTypeId::INTEGER; + static constexpr const LogicalTypeId UINTEGER = LogicalTypeId::UINTEGER; + static constexpr const LogicalTypeId BIGINT = LogicalTypeId::BIGINT; + static constexpr const LogicalTypeId UBIGINT = LogicalTypeId::UBIGINT; + static constexpr const LogicalTypeId FLOAT = LogicalTypeId::FLOAT; + static constexpr const LogicalTypeId DOUBLE = LogicalTypeId::DOUBLE; + static constexpr const LogicalTypeId DATE = LogicalTypeId::DATE; + static constexpr const LogicalTypeId TIMESTAMP = LogicalTypeId::TIMESTAMP; + static constexpr const LogicalTypeId TIMESTAMP_S = LogicalTypeId::TIMESTAMP_SEC; + static constexpr const LogicalTypeId TIMESTAMP_MS = LogicalTypeId::TIMESTAMP_MS; + static constexpr const LogicalTypeId TIMESTAMP_NS = LogicalTypeId::TIMESTAMP_NS; + static constexpr const LogicalTypeId TIME = LogicalTypeId::TIME; + static constexpr const LogicalTypeId TIMESTAMP_TZ = LogicalTypeId::TIMESTAMP_TZ; + static constexpr const LogicalTypeId TIME_TZ = LogicalTypeId::TIME_TZ; + static constexpr const LogicalTypeId VARCHAR = LogicalTypeId::VARCHAR; + static constexpr const LogicalTypeId ANY = LogicalTypeId::ANY; + static constexpr const LogicalTypeId BLOB = LogicalTypeId::BLOB; + static constexpr const LogicalTypeId INTERVAL = LogicalTypeId::INTERVAL; + static constexpr const LogicalTypeId HUGEINT = LogicalTypeId::HUGEINT; + static constexpr const LogicalTypeId UUID = LogicalTypeId::UUID; + static constexpr const LogicalTypeId HASH = LogicalTypeId::UBIGINT; + static constexpr const LogicalTypeId POINTER = LogicalTypeId::POINTER; + static constexpr const 
LogicalTypeId TABLE = LogicalTypeId::TABLE; + static constexpr const LogicalTypeId LAMBDA = LogicalTypeId::LAMBDA; + static constexpr const LogicalTypeId INVALID = LogicalTypeId::INVALID; + static constexpr const LogicalTypeId ROW_TYPE = LogicalTypeId::BIGINT; + + // explicitly allowing these functions to be capitalized to be in-line with the remaining functions + DUCKDB_API static LogicalType DECIMAL(int width, int scale); // NOLINT + DUCKDB_API static LogicalType VARCHAR_COLLATION(string collation); // NOLINT + DUCKDB_API static LogicalType LIST(LogicalType child); // NOLINT + DUCKDB_API static LogicalType STRUCT(child_list_t children); // NOLINT + DUCKDB_API static LogicalType AGGREGATE_STATE(aggregate_state_t state_type); // NOLINT + DUCKDB_API static LogicalType MAP(LogicalType child); // NOLINT + DUCKDB_API static LogicalType MAP(child_list_t children); // NOLINT + DUCKDB_API static LogicalType MAP(LogicalType key, LogicalType value); // NOLINT + DUCKDB_API static LogicalType UNION(child_list_t members); // NOLINT + DUCKDB_API static LogicalType ENUM(const string& enum_name, Vector& ordered_data, idx_t size); // NOLINT + DUCKDB_API static LogicalType USER(const string& user_type_name); // NOLINT + //! A list of all NUMERIC types (integral and floating point types) + DUCKDB_API static const vector Numeric(); + //! A list of all INTEGRAL types + DUCKDB_API static const vector Integral(); + //! A list of ALL SQL types + DUCKDB_API static const vector AllTypes(); +}; + +struct DecimalType { + DUCKDB_API static uint8_t GetWidth(const LogicalType& type); + DUCKDB_API static uint8_t GetScale(const LogicalType& type); + DUCKDB_API static uint8_t MaxWidth(); +}; + +struct StringType { + DUCKDB_API static string GetCollation(const LogicalType& type); +}; + +struct ListType { + DUCKDB_API static const LogicalType& GetChildType(const LogicalType& type); +}; + +struct UserType { + DUCKDB_API static const string& GetTypeName(const LogicalType& type); +}; + +struct EnumType { + DUCKDB_API static const string& GetTypeName(const LogicalType& type); + DUCKDB_API static int64_t GetPos(const LogicalType& type, const string_t& key); + DUCKDB_API static Vector& GetValuesInsertOrder(const LogicalType& type); + DUCKDB_API static idx_t GetSize(const LogicalType& type); + DUCKDB_API static const string GetValue(const Value& val); + DUCKDB_API static void SetCatalog(LogicalType& type, TypeCatalogEntry* catalog_entry); + DUCKDB_API static TypeCatalogEntry* GetCatalog(const LogicalType& type); + DUCKDB_API static PhysicalType GetPhysicalType(const LogicalType& type); +}; + +struct StructType { + DUCKDB_API static const child_list_t& GetChildTypes(const LogicalType& type); + DUCKDB_API static const LogicalType& GetChildType(const LogicalType& type, idx_t index); + DUCKDB_API static const string& GetChildName(const LogicalType& type, idx_t index); + DUCKDB_API static idx_t GetChildCount(const LogicalType& type); +}; + +struct MapType { + DUCKDB_API static const LogicalType& KeyType(const LogicalType& type); + DUCKDB_API static const LogicalType& ValueType(const LogicalType& type); +}; + +struct UnionType { + DUCKDB_API static const idx_t MAX_UNION_MEMBERS = 256; + DUCKDB_API static idx_t GetMemberCount(const LogicalType& type); + DUCKDB_API static const LogicalType& GetMemberType(const LogicalType& type, idx_t index); + DUCKDB_API static const string& GetMemberName(const LogicalType& type, idx_t index); + DUCKDB_API static const child_list_t CopyMemberTypes(const LogicalType& type); +}; + +struct 
AggregateStateType { + DUCKDB_API static const string GetTypeName(const LogicalType& type); + DUCKDB_API static const aggregate_state_t& GetStateType(const LogicalType& type); +}; + +DUCKDB_API string LogicalTypeIdToString(LogicalTypeId type); + +DUCKDB_API LogicalTypeId TransformStringToLogicalTypeId(const string& str); + +DUCKDB_API LogicalType TransformStringToLogicalType(const string& str); + +DUCKDB_API LogicalType TransformStringToLogicalType(const string& str, ClientContext& context); + +//! The PhysicalType used by the row identifiers column +extern const PhysicalType ROW_TYPE; + +DUCKDB_API string TypeIdToString(PhysicalType type); +DUCKDB_API idx_t GetTypeIdSize(PhysicalType type); +DUCKDB_API bool TypeIsConstantSize(PhysicalType type); +DUCKDB_API bool TypeIsIntegral(PhysicalType type); +DUCKDB_API bool TypeIsNumeric(PhysicalType type); +DUCKDB_API bool TypeIsInteger(PhysicalType type); + +bool ApproxEqual(float l, float r); +bool ApproxEqual(double l, double r); + +struct aggregate_state_t { + aggregate_state_t(string function_name_p, LogicalType return_type_p, vector bound_argument_types_p) + : function_name(std::move(function_name_p)) + , return_type(std::move(return_type_p)) + , bound_argument_types(std::move(bound_argument_types_p)) {} + + string function_name; + LogicalType return_type; + vector bound_argument_types; +}; + +} // namespace alp_bench diff --git a/benchmarks/include/duckdb/validity_mask.hpp b/benchmarks/include/duckdb/validity_mask.hpp new file mode 100644 index 0000000..fe869d6 --- /dev/null +++ b/benchmarks/include/duckdb/validity_mask.hpp @@ -0,0 +1,291 @@ +#pragma once + +#include "common.hpp" +#include "to_string.hpp" +#include "types.hpp" +#include "vector_size.hpp" + +namespace lwcbench { +struct ValidityMask; + +template +struct TemplatedValidityData { + static constexpr const int BITS_PER_VALUE = sizeof(V) * 8; + static constexpr const V MAX_ENTRY = ~V(0); + +public: + inline explicit TemplatedValidityData(idx_t count) { + auto entry_count = EntryCount(count); + owned_data = unique_ptr(new V[entry_count]); + for (idx_t entry_idx = 0; entry_idx < entry_count; entry_idx++) { + owned_data[entry_idx] = MAX_ENTRY; + } + } + inline TemplatedValidityData(const V* validity_mask, idx_t count) { + D_ASSERT(validity_mask); + auto entry_count = EntryCount(count); + owned_data = unique_ptr(new V[entry_count]); + for (idx_t entry_idx = 0; entry_idx < entry_count; entry_idx++) { + owned_data[entry_idx] = validity_mask[entry_idx]; + } + } + + unique_ptr owned_data; + +public: + static inline idx_t EntryCount(idx_t count) { return (count + (BITS_PER_VALUE - 1)) / BITS_PER_VALUE; } +}; + +using validity_t = uint64_t; + +struct ValidityData : TemplatedValidityData { +public: + DUCKDB_API explicit ValidityData(idx_t count); + DUCKDB_API ValidityData(const ValidityMask& original, idx_t count); +}; + +//! 
Type used for validity masks +template +struct TemplatedValidityMask { + using ValidityBuffer = TemplatedValidityData; + +public: + static constexpr const int BITS_PER_VALUE = ValidityBuffer::BITS_PER_VALUE; + static constexpr const int STANDARD_ENTRY_COUNT = (STANDARD_VECTOR_SIZE + (BITS_PER_VALUE - 1)) / BITS_PER_VALUE; + static constexpr const int STANDARD_MASK_SIZE = STANDARD_ENTRY_COUNT * sizeof(validity_t); + +public: + inline TemplatedValidityMask() + : validity_mask(nullptr) {} + inline explicit TemplatedValidityMask(idx_t max_count) { Initialize(max_count); } + inline explicit TemplatedValidityMask(V* ptr) + : validity_mask(ptr) {} + inline TemplatedValidityMask(const TemplatedValidityMask& original, idx_t count) { Copy(original, count); } + + static inline idx_t ValidityMaskSize(idx_t count = STANDARD_VECTOR_SIZE) { + return ValidityBuffer::EntryCount(count) * sizeof(V); + } + inline bool AllValid() const { return !validity_mask; } + inline bool CheckAllValid(idx_t count) const { + if (AllValid()) { return true; } + idx_t entry_count = ValidityBuffer::EntryCount(count); + idx_t valid_count = 0; + for (idx_t i = 0; i < entry_count; i++) { + valid_count += validity_mask[i] == ValidityBuffer::MAX_ENTRY; + } + return valid_count == entry_count; + } + + inline bool CheckAllValid(idx_t to, idx_t from) const { + if (AllValid()) { return true; } + for (idx_t i = from; i < to; i++) { + if (!RowIsValid(i)) { return false; } + } + return true; + } + + idx_t CountValid(const idx_t count) const { + if (AllValid() || count == 0) { return count; } + + idx_t valid = 0; + const auto entry_count = EntryCount(count); + for (idx_t entry_idx = 0; entry_idx < entry_count;) { + auto entry = GetValidityEntry(entry_idx++); + // Handle ragged end (if not exactly multiple of BITS_PER_VALUE) + if (entry_idx == entry_count && count % BITS_PER_VALUE != 0) { + idx_t idx_in_entry; + GetEntryIndex(count, entry_idx, idx_in_entry); + for (idx_t i = 0; i < idx_in_entry; ++i) { + valid += idx_t(RowIsValid(entry, i)); + } + break; + } + + // Handle all set + if (AllValid(entry)) { + valid += BITS_PER_VALUE; + continue; + } + + // Count partial entry (Kernighan's algorithm) + while (entry) { + entry &= (entry - 1); + ++valid; + } + } + + return valid; + } + + inline V* GetData() const { return validity_mask; } + inline void Reset() { + validity_mask = nullptr; + validity_data.reset(); + } + + static inline idx_t EntryCount(idx_t count) { return ValidityBuffer::EntryCount(count); } + inline V GetValidityEntry(idx_t entry_idx) const { + if (!validity_mask) { return ValidityBuffer::MAX_ENTRY; } + return validity_mask[entry_idx]; + } + static inline bool AllValid(V entry) { return entry == ValidityBuffer::MAX_ENTRY; } + static inline bool NoneValid(V entry) { return entry == 0; } + static inline bool RowIsValid(V entry, idx_t idx_in_entry) { return entry & (V(1) << V(idx_in_entry)); } + static inline void GetEntryIndex(idx_t row_idx, idx_t& entry_idx, idx_t& idx_in_entry) { + entry_idx = row_idx / BITS_PER_VALUE; + idx_in_entry = row_idx % BITS_PER_VALUE; + } + //! Get an entry that has first-n bits set as valid and rest set as invalid + static inline V EntryWithValidBits(idx_t n) { + if (n == 0) { return V(0); } + return ValidityBuffer::MAX_ENTRY >> (BITS_PER_VALUE - n); + } + + //! RowIsValidUnsafe should only be used if AllValid() is false: it achieves the same as RowIsValid but skips a + //! 
not-null check + inline bool RowIsValidUnsafe(idx_t row_idx) const { + D_ASSERT(validity_mask); + idx_t entry_idx, idx_in_entry; + GetEntryIndex(row_idx, entry_idx, idx_in_entry); + auto entry = GetValidityEntry(entry_idx); + return RowIsValid(entry, idx_in_entry); + } + + //! Returns true if a row is valid (i.e. not null), false otherwise + inline bool RowIsValid(idx_t row_idx) const { + if (!validity_mask) { return true; } + return RowIsValidUnsafe(row_idx); + } + + //! Same as SetValid, but skips a null check on validity_mask + inline void SetValidUnsafe(idx_t row_idx) { + D_ASSERT(validity_mask); + idx_t entry_idx, idx_in_entry; + GetEntryIndex(row_idx, entry_idx, idx_in_entry); + validity_mask[entry_idx] |= (V(1) << V(idx_in_entry)); + } + + //! Marks the entry at the specified row index as valid (i.e. not-null) + inline void SetValid(idx_t row_idx) { + if (!validity_mask) { + // if AllValid() we don't need to do anything + // the row is already valid + return; + } + SetValidUnsafe(row_idx); + } + + //! Marks the bit at the specified entry as invalid (i.e. null) + inline void SetInvalidUnsafe(idx_t entry_idx, idx_t idx_in_entry) { + D_ASSERT(validity_mask); + validity_mask[entry_idx] &= ~(V(1) << V(idx_in_entry)); + } + + //! Marks the bit at the specified row index as invalid (i.e. null) + inline void SetInvalidUnsafe(idx_t row_idx) { + idx_t entry_idx, idx_in_entry; + GetEntryIndex(row_idx, entry_idx, idx_in_entry); + SetInvalidUnsafe(entry_idx, idx_in_entry); + } + + //! Marks the entry at the specified row index as invalid (i.e. null) + inline void SetInvalid(idx_t row_idx) { + if (!validity_mask) { + D_ASSERT(row_idx <= STANDARD_VECTOR_SIZE); + Initialize(STANDARD_VECTOR_SIZE); + } + SetInvalidUnsafe(row_idx); + } + + //! Mark the entry at the specified index as either valid or invalid (non-null or null) + inline void Set(idx_t row_idx, bool valid) { + if (valid) { + SetValid(row_idx); + } else { + SetInvalid(row_idx); + } + } + + //! Ensure the validity mask is writable, allocating space if it is not initialized + inline void EnsureWritable() { + if (!validity_mask) { Initialize(); } + } + + //! Marks exactly "count" bits in the validity mask as invalid (null) + inline void SetAllInvalid(idx_t count) { + EnsureWritable(); + if (count == 0) { return; } + auto last_entry_index = ValidityBuffer::EntryCount(count) - 1; + for (idx_t i = 0; i < last_entry_index; i++) { + validity_mask[i] = 0; + } + auto last_entry_bits = count % static_cast(BITS_PER_VALUE); + validity_mask[last_entry_index] = (last_entry_bits == 0) ? 0 : (ValidityBuffer::MAX_ENTRY << (last_entry_bits)); + } + + //! Marks exactly "count" bits in the validity mask as valid (not null) + inline void SetAllValid(idx_t count) { + EnsureWritable(); + if (count == 0) { return; } + auto last_entry_index = ValidityBuffer::EntryCount(count) - 1; + for (idx_t i = 0; i < last_entry_index; i++) { + validity_mask[i] = ValidityBuffer::MAX_ENTRY; + } + auto last_entry_bits = count % static_cast(BITS_PER_VALUE); + validity_mask[last_entry_index] |= + (last_entry_bits == 0) ? 
ValidityBuffer::MAX_ENTRY : ~(ValidityBuffer::MAX_ENTRY << (last_entry_bits)); + } + + inline bool IsMaskSet() const { + if (validity_mask) { return true; } + return false; + } + +public: + inline void Initialize(validity_t* validity) { + validity_data.reset(); + validity_mask = validity; + } + inline void Initialize(const TemplatedValidityMask& other) { + validity_mask = other.validity_mask; + validity_data = other.validity_data; + } + inline void Initialize(idx_t count = STANDARD_VECTOR_SIZE) { + validity_data = make_buffer(count); + validity_mask = validity_data->owned_data.get(); + } + inline void Copy(const TemplatedValidityMask& other, idx_t count) { + if (other.AllValid()) { + validity_data = nullptr; + validity_mask = nullptr; + } else { + validity_data = make_buffer(other.validity_mask, count); + validity_mask = validity_data->owned_data.get(); + } + } + +protected: + V* validity_mask; + buffer_ptr validity_data; +}; + +struct ValidityMask : public TemplatedValidityMask { +public: + inline ValidityMask() + : TemplatedValidityMask(nullptr) {} + inline explicit ValidityMask(idx_t max_count) + : TemplatedValidityMask(max_count) {} + inline explicit ValidityMask(validity_t* ptr) + : TemplatedValidityMask(ptr) {} + inline ValidityMask(const ValidityMask& original, idx_t count) + : TemplatedValidityMask(original, count) {} + +public: + DUCKDB_API void Resize(idx_t old_size, idx_t new_size); + + DUCKDB_API void Slice(const ValidityMask& other, idx_t offset, idx_t end); + DUCKDB_API void Combine(const ValidityMask& other, idx_t count); + DUCKDB_API string ToString(idx_t count) const; +}; + +} // namespace lwcbench diff --git a/benchmarks/include/duckdb/vector.hpp b/benchmarks/include/duckdb/vector.hpp new file mode 100644 index 0000000..1aa8f5b --- /dev/null +++ b/benchmarks/include/duckdb/vector.hpp @@ -0,0 +1,7 @@ +#pragma once + +#include + +namespace alp_bench { +using std::vector; +} diff --git a/benchmarks/include/duckdb/vector_size.hpp b/benchmarks/include/duckdb/vector_size.hpp new file mode 100644 index 0000000..c130e77 --- /dev/null +++ b/benchmarks/include/duckdb/vector_size.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include "types.hpp" + +namespace lwcbench { + +//! The vector size used in the execution engine +#ifndef STANDARD_VECTOR_SIZE +#define STANDARD_VECTOR_SIZE 2048 +#endif + +#if ((STANDARD_VECTOR_SIZE & (STANDARD_VECTOR_SIZE - 1)) != 0) +#error Vector size should be a power of two +#endif + +//! 
Zero selection vector: completely filled with the value 0 [READ ONLY]
+extern const sel_t ZERO_VECTOR[STANDARD_VECTOR_SIZE];
+
+} // namespace lwcbench
diff --git a/benchmarks/include/duckdb/winapi.hpp b/benchmarks/include/duckdb/winapi.hpp
new file mode 100644
index 0000000..ec113bb
--- /dev/null
+++ b/benchmarks/include/duckdb/winapi.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#ifndef DUCKDB_API
+#ifdef _WIN32
+#if defined(DUCKDB_BUILD_LIBRARY) && !defined(DUCKDB_BUILD_LOADABLE_EXTENSION)
+#define DUCKDB_API __declspec(dllexport)
+#else
+#define DUCKDB_API __declspec(dllimport)
+#endif
+#else
+#define DUCKDB_API
+#endif
+#endif
+
+#ifndef DUCKDB_EXTENSION_API
+#ifdef _WIN32
+#ifdef DUCKDB_BUILD_LOADABLE_EXTENSION
+#define DUCKDB_EXTENSION_API __declspec(dllexport)
+#else
+#define DUCKDB_EXTENSION_API
+#endif
+#else
+#define DUCKDB_EXTENSION_API __attribute__((visibility("default")))
+#endif
+#endif
diff --git a/benchmarks/include/gorillas/gorillas.hpp b/benchmarks/include/gorillas/gorillas.hpp
new file mode 100644
index 0000000..5ccf704
--- /dev/null
+++ b/benchmarks/include/gorillas/gorillas.hpp
@@ -0,0 +1,224 @@
+#pragma once
+#include "chimp/bit_reader.hpp"
+#include "chimp/flag_buffer.hpp"
+#include "chimp/leading_zero_buffer.hpp"
+#include "chimp/output_bit_stream.hpp"
+#include "duckdb/duckdb.h"
+#include "duckdb/exception.hpp"
+#include "duckdb/fast_mem.hpp"
+#include "duckdb/likely.hpp"
+#include "duckdb/limits.hpp"
+#include "gorillas/gorillas_utils.hpp"
+#include <cstdint>
+
+namespace alp_bench {
+
+template <class CHIMP_TYPE, bool EMPTY>
+struct GorillasCompressionState {
+
+	GorillasCompressionState()
+	    : previous_leading_zeros(NumericLimits<uint8_t>::Maximum())
+	    , previous_trailing_zeros(0) {
+		previous_value = 0;
+	}
+
+	inline void SetLeadingZeros(int8_t value = NumericLimits<uint8_t>::Maximum()) {
+		this->previous_leading_zeros = value;
+	}
+
+	inline void SetTrailingZeros(int8_t value = 0) { this->previous_trailing_zeros = value; }
+
+	void Flush() {
+		//
+	}
+
+	// Reset the state
+	void Reset() {
+		first = true;
+		SetLeadingZeros();
+		SetTrailingZeros();
+		flag_buffer.Reset();
+		previous_value = 0;
+	}
+
+	CHIMP_TYPE BitsWritten() const { return output.BitsWritten() + flag_buffer.BitsWritten(); }
+
+	OutputBitStream<EMPTY> output; // The stream to write to
+	FlagBuffer<EMPTY> flag_buffer;
+	uint8_t previous_leading_zeros; //! The leading zeros of the reference value
+	uint8_t previous_trailing_zeros;
+	CHIMP_TYPE previous_value = 0;
+	bool first = true;
+};
+
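Gorilla-style XOR compression (this is the Facebook Gorilla scheme that Chimp and Patas later refine) exploits the fact that consecutive values in a time series usually share sign, exponent, and top mantissa bits, so their XOR is mostly zeros with a short "meaningful" band in the middle; the state above tracks that band. A standalone illustration using only standard builtins:

```cpp
#include <cstdint>
#include <cstring>

// Hedged sketch: XOR two nearby doubles and measure the zero runs that the
// compressor stores instead of the full 64 bits.
void XorIntuition() {
	double a = 15.5, b = 15.75;
	uint64_t ua, ub;
	memcpy(&ua, &a, sizeof a);
	memcpy(&ub, &b, sizeof b);
	uint64_t x = ua ^ ub;                // == 0x0000800000000000: one mantissa bit differs
	int lead       = __builtin_clzll(x); // 16 leading zero bits
	int trail      = __builtin_ctzll(x); // 47 trailing zero bits
	int meaningful = 64 - lead - trail;  // only 1 bit actually needs storing
	(void)meaningful;
}
```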
+template <class CHIMP_TYPE, bool EMPTY>
+class GorillasCompression {
+public:
+	using State = GorillasCompressionState<CHIMP_TYPE, EMPTY>;
+
+	//! The amount of bits needed to store an index between 0-127
+	static constexpr uint8_t SIGNIFICANT_BITS_SIZE =
+	    6; // The amount needed to store the maximum number of significant bits (0-63)
+	static constexpr uint8_t LEADING_ZEROS_BITS_SIZE = 5;
+	static constexpr uint8_t BIT_SIZE = sizeof(CHIMP_TYPE) * 8;
+
+	static void Store(CHIMP_TYPE in, State& state) {
+		if (state.first) {
+			WriteFirst(in, state);
+		} else {
+			CompressValue(in, state);
+		}
+	}
+
+	//! Write the content of the bit buffer to the stream
+	static void Flush(State& state) {
+		if (!EMPTY) { state.output.Flush(); }
+	}
+
+	static void WriteFirst(CHIMP_TYPE in, State& state) {
+		state.output.template WriteValue<CHIMP_TYPE, BIT_SIZE>(in);
+		state.previous_value = in;
+		state.first          = false;
+	}
+
+	static void CompressValue(CHIMP_TYPE in, State& state) {
+
+		CHIMP_TYPE xor_result;
+		xor_result = (CHIMP_TYPE)in ^ state.previous_value;
+
+		// Compress the value
+		if (xor_result == 0) {
+			state.flag_buffer.Insert(GorillasConstants::Flags::VALUE_IDENTICAL);
+		} else { // Values are not identical
+
+			uint8_t leading_zeros = CountZeros<CHIMP_TYPE>::Leading(xor_result);
+			if (leading_zeros >= 32) { // To prevent overflow
+				leading_zeros = 31;
+			}
+
+			uint8_t trailing_zeros = CountZeros<CHIMP_TYPE>::Trailing(xor_result);
+
+			if (leading_zeros >= state.previous_leading_zeros && trailing_zeros >= state.previous_trailing_zeros) {
+				state.flag_buffer.Insert(alp_bench::GorillasConstants::Flags::LEADING_HIGHER_OR_EQUAL);
+				uint32_t significant_bits = BIT_SIZE - state.previous_leading_zeros - state.previous_trailing_zeros;
+				state.output.template WriteValue<CHIMP_TYPE>(xor_result >> state.previous_trailing_zeros,
+				                                             significant_bits);
+			} else {
+				state.flag_buffer.Insert(alp_bench::GorillasConstants::Flags::LEADING_LOWER);
+				uint32_t significant_bits = BIT_SIZE - leading_zeros - trailing_zeros;
+
+				state.output.template WriteValue<uint8_t, LEADING_ZEROS_BITS_SIZE>(leading_zeros);
+
+				state.output.template WriteValue<uint8_t, SIGNIFICANT_BITS_SIZE>(significant_bits - 1);
+				state.output.template WriteValue<CHIMP_TYPE>(xor_result >> trailing_zeros, significant_bits);
+				state.SetLeadingZeros(leading_zeros);
+				state.SetTrailingZeros(trailing_zeros);
+			}
+		}
+		state.previous_value = in;
+	}
+};
+
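The three flags therefore carry very different costs: VALUE_IDENTICAL is a flag with no payload, LEADING_HIGHER_OR_EQUAL reuses the previous leading/trailing window and writes only its payload, and LEADING_LOWER additionally spends 5 bits on the leading-zero count and 6 bits on the length. A hedged helper (not part of the patch) that mirrors the branch structure of `CompressValue` above, for the 64-bit case, to predict payload size:

```cpp
#include <cstdint>

// Given the XOR of the new value with the previous one and the previous zero
// counts, return how many payload bits CompressValue would emit (the flag
// itself lives in the separate FlagBuffer).
static uint32_t GorillasPayloadBits(uint64_t xor_result, uint8_t prev_lead, uint8_t prev_trail) {
	if (xor_result == 0) { return 0; }  // VALUE_IDENTICAL: flag only
	uint8_t lead = __builtin_clzll(xor_result);
	if (lead >= 32) { lead = 31; }      // same clamp as above: 5 bits store 0-31
	uint8_t trail = __builtin_ctzll(xor_result);
	if (lead >= prev_lead && trail >= prev_trail) {
		return 64u - prev_lead - prev_trail;  // LEADING_HIGHER_OR_EQUAL: reuse window
	}
	return 5u + 6u + (64u - lead - trail);    // LEADING_LOWER: counts + payload
}
```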
+//===--------------------------------------------------------------------===//
+// Decompression
+//===--------------------------------------------------------------------===//
+
+template <class CHIMP_TYPE>
+struct GorillasDecompressionState {
+public:
+	GorillasDecompressionState()
+	    : reference_value(0)
+	    , first(true) {
+		ResetZeros();
+	}
+
+	void Reset() {
+		ResetZeros();
+		reference_value = 0;
+		first           = true;
+	}
+
+	inline void ResetZeros() {
+		leading_zeros  = NumericLimits<uint8_t>::Maximum();
+		trailing_zeros = 0;
+	}
+
+	inline void SetLeadingZeros(uint8_t value) { leading_zeros = value; }
+
+	inline void SetTrailingZeros(uint8_t value) {
+		D_ASSERT(value <= sizeof(CHIMP_TYPE) * 8);
+		trailing_zeros = value;
+	}
+
+	uint8_t LeadingZeros() const { return leading_zeros; }
+	uint8_t TrailingZeros() const { return trailing_zeros; }
+
+	BitReader input;
+	uint8_t leading_zeros;
+	uint8_t trailing_zeros;
+	CHIMP_TYPE reference_value = 0;
+
+	bool first;
+};
+
+template <class CHIMP_TYPE>
+struct GorillasDecompression {
+public:
+	using DecompressState = GorillasDecompressionState<CHIMP_TYPE>;
+
+	static constexpr uint8_t BIT_SIZE = sizeof(CHIMP_TYPE) * 8;
+	static constexpr uint8_t SIGNIFICANT_BITS_SIZE = 6;
+	static constexpr uint8_t LEADING_ZEROS_BITS_SIZE = 5;
+
+	static inline CHIMP_TYPE Load(GorillasConstants::Flags flag, DecompressState& state) {
+		if (DUCKDB_UNLIKELY(state.first)) {
+			return LoadFirst(state);
+		} else {
+			return DecompressValue(flag, state);
+		}
+	}
+
+	static inline CHIMP_TYPE LoadFirst(DecompressState& state) {
+		CHIMP_TYPE result = state.input.template ReadValue<CHIMP_TYPE, BIT_SIZE>();
+		state.first       = false;
+		state.reference_value = result;
+		return result;
+	}
+
+	static inline CHIMP_TYPE DecompressValue(GorillasConstants::Flags flag, DecompressState& state) {
+		CHIMP_TYPE result;
+		switch (flag) {
+		case GorillasConstants::Flags::VALUE_IDENTICAL: {
+			//! Value is identical to previous value
+			result = state.reference_value;
+			break;
+		}
+		case GorillasConstants::Flags::LEADING_LOWER: {
+			// state.leading_zeros = leading_zeros[leading_zero_index++]; // comment/uncomment here
+			state.leading_zeros = state.input.template ReadValue<uint8_t>(LEADING_ZEROS_BITS_SIZE);
+
+			auto significant_bits = state.input.template ReadValue<uint8_t>(SIGNIFICANT_BITS_SIZE) + 1;
+			state.trailing_zeros  = BIT_SIZE - significant_bits - state.leading_zeros;
+			result = state.input.template ReadValue<CHIMP_TYPE>(BIT_SIZE - state.leading_zeros - state.trailing_zeros);
+			result <<= state.trailing_zeros;
+			result ^= state.reference_value;
+			break;
+		}
+		case GorillasConstants::Flags::LEADING_HIGHER_OR_EQUAL: {
+			result = state.input.template ReadValue<CHIMP_TYPE>(BIT_SIZE - state.leading_zeros - state.trailing_zeros);
+			result <<= state.trailing_zeros;
+			result ^= state.reference_value;
+			break;
+		}
+		default:
+			// std::cout << "Gorillas compression flag with value not recognized ";
+			// std::cout << flag;
+			break;
+			// throw InternalException("Gorillas compression flag with value %d not recognized", flag);
+		}
+		state.reference_value = result;
+		return result;
+	}
+};
+
+} // namespace alp_bench
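Reading a LEADING_LOWER record is the exact mirror: recover the window from the 5-bit and 6-bit headers, shift the payload back over the trailing-zero run, and XOR against the reference. The restore arithmetic in isolation:

```cpp
#include <cstdint>

// Hedged sketch of the restore step for one 64-bit LEADING_LOWER record, with
// the payload already read from the bit stream.
static uint64_t RestoreValue(uint64_t reference, uint64_t payload,
                             uint8_t leading_zeros, uint8_t significant_bits) {
	uint8_t trailing_zeros = 64 - significant_bits - leading_zeros;
	uint64_t xor_result    = payload << trailing_zeros; // re-align the stored bits
	return xor_result ^ reference;                      // undo the XOR delta
}
```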
+} + +static inline int __builtin_ctz(unsigned int value) { + unsigned long trailing_zero = 0; + + if (_BitScanForward(&trailing_zero, value)) { + return trailing_zero; + } else { + // This is undefined, I better choose 32 than 0 + return 32; + } +} + +static inline int __builtin_clz(unsigned int value) { + unsigned long leading_zero = 0; + + if (_BitScanReverse(&leading_zero, value)) { + return 31 - leading_zero; + } else { + // Same remarks as above + return 32; + } +} + +#endif + +namespace alp_bench { +struct GorillasConstants { + enum class Flags : uint8_t { VALUE_IDENTICAL = 0, LEADING_HIGHER_OR_EQUAL = 2, LEADING_LOWER = 3 }; +}; + +} // namespace alp_bench diff --git a/benchmarks/include/patas/patas.hpp b/benchmarks/include/patas/patas.hpp new file mode 100644 index 0000000..83a32a3 --- /dev/null +++ b/benchmarks/include/patas/patas.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "chimp/byte_reader.hpp" +#include "chimp/byte_writer.hpp" +#include "chimp/chimp_utils.hpp" +#include "chimp/packed_data.hpp" +#include "chimp/ring_buffer.hpp" +#include +#include + +namespace alp_bench { namespace patas { + +struct PatasUnpackedValueStats { + uint8_t significant_bytes; + uint8_t trailing_zeros; + uint8_t index_diff; +}; + +template +class PatasCompressionState { +public: + PatasCompressionState() + : index(0) + , first(true) {} + +public: + void Reset() { + index = 0; + first = true; + ring_buffer.Reset(); + packed_data_buffer.Reset(); + } + void SetOutputBuffer(uint8_t* output) { + byte_writer.SetStream(output); + Reset(); + } + idx_t Index() const { return index; } + +public: + void UpdateMetadata(uint8_t trailing_zero, uint8_t byte_count, uint8_t index_diff) { + if (!EMPTY) { + packed_data_buffer.Insert(PackedDataUtils::Pack(index_diff, byte_count, trailing_zero)); + } + index++; + } + +public: + ByteWriter byte_writer; + PackedDataBuffer packed_data_buffer; + idx_t index; + RingBuffer ring_buffer; + bool first; +}; + +template +struct PatasCompression { + using State = PatasCompressionState; + static constexpr uint8_t EXACT_TYPE_BITSIZE = sizeof(EXACT_TYPE) * 8; + + static void Store(EXACT_TYPE value, State& state) { + if (state.first) { + StoreFirst(value, state); + } else { + StoreCompressed(value, state); + } + } + + static void StoreFirst(EXACT_TYPE value, State& state) { + // write first value, uncompressed + state.ring_buffer.template Insert(value); + state.byte_writer.template WriteValue(value); + state.first = false; + state.UpdateMetadata(0, sizeof(EXACT_TYPE), 0); + } + + static void StoreCompressed(EXACT_TYPE value, State& state) { + auto key = state.ring_buffer.Key(value); + uint64_t reference_index = state.ring_buffer.IndexOf(key); + + // Find the reference value to use when compressing the current value + const bool exceeds_highest_index = reference_index > state.ring_buffer.Size(); + const bool difference_too_big = + ((state.ring_buffer.Size() + 1) - reference_index) >= ChimpConstants::BUFFER_SIZE; + if (exceeds_highest_index || difference_too_big) { + // Reference index is not in range, use the directly previous value + reference_index = state.ring_buffer.Size(); + } + const auto reference_value = state.ring_buffer.Value(reference_index % ChimpConstants::BUFFER_SIZE); + + // XOR with previous value + EXACT_TYPE xor_result = value ^ reference_value; + + // Figure out the trailing zeros (max 6 bits) + const uint8_t trailing_zero = CountZeros::Trailing(xor_result); + + const uint8_t leading_zero = CountZeros::Leading(xor_result); + + const bool is_equal = xor_result == 0; 
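+		// Editorial note: `is_equal` is used arithmetically below, not only
+		// as a branch. It zeroes `significant_bits` through the
+		// `!is_equal * (...)` product, and `trailing_zero - is_equal` backs
+		// the shift off by one so `xor_result >> shift` stays a defined
+		// shift when the XOR is zero and the trailing-zero count reports
+		// the full word width.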
+ + // Figure out the significant bytes (max 3 bits) + const uint8_t significant_bits = !is_equal * (EXACT_TYPE_BITSIZE - trailing_zero - leading_zero); + const uint8_t significant_bytes = (significant_bits >> 3) + ((significant_bits & 7) != 0); + + // Avoid an invalid shift error when xor_result is 0 + state.byte_writer.template WriteValue(xor_result >> (trailing_zero - is_equal), significant_bits); + + state.ring_buffer.Insert(value); + + const uint8_t index_difference = state.ring_buffer.Size() - reference_index; + state.UpdateMetadata(trailing_zero - is_equal, significant_bytes, index_difference); + } +}; + +// Decompression + +template +struct PatasDecompression { + static inline EXACT_TYPE + DecompressValue(ByteReader& byte_reader, uint8_t byte_count, uint8_t trailing_zero, EXACT_TYPE previous) { + return (byte_reader.ReadValue(byte_count, trailing_zero) << trailing_zero) ^ previous; + } +}; + +}} // namespace alp_bench::patas diff --git a/benchmarks/test/CMakeLists.txt b/benchmarks/test/CMakeLists.txt new file mode 100644 index 0000000..10a08d0 --- /dev/null +++ b/benchmarks/test/CMakeLists.txt @@ -0,0 +1,20 @@ +# Test Patas: ---------------------------------------------------------------------------------------------------------- +add_executable(test_patas test_patas.cpp) +target_link_libraries(test_patas PRIVATE gtest_main) +gtest_discover_tests(test_patas) + +# Test CHIMP128: ------------------------------------------------------------------------------------------------------- +add_executable(test_chimp128 test_chimp128.cpp) +target_link_libraries(test_chimp128 PRIVATE gtest_main) +gtest_discover_tests(test_chimp128) + +# Test CHIMP: ---------------------------------------------------------------------------------------------------------- +add_executable(test_chimp test_chimp.cpp) +target_link_libraries(test_chimp PRIVATE gtest_main) +gtest_discover_tests(test_chimp) + +# Test GORILLAS: ---------------------------------------------------------------------------------------------------------- +add_executable(test_gorillas test_gorillas.cpp) +target_link_libraries(test_gorillas PRIVATE gtest_main) +gtest_discover_tests(test_gorillas) + diff --git a/benchmarks/test/test_chimp.cpp b/benchmarks/test/test_chimp.cpp new file mode 100644 index 0000000..0e2199b --- /dev/null +++ b/benchmarks/test/test_chimp.cpp @@ -0,0 +1,114 @@ +#include "chimp/chimp.hpp" +#include "data.hpp" +#include "gtest/gtest.h" +#include + +class chimp_test : public ::testing::Test { +public: + uint8_t* data_arr; + uint8_t* flags_arr; + uint8_t* leading_zero_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + alp_bench::ChimpCompressionState state; + alp_bench::ChimpConstants::Flags* flags; + uint8_t* leading_zero_unpacked; + alp_bench::FlagBuffer flag_buffer; + alp_bench::LeadingZeroBuffer leading_zero_buffer; + alp_bench::ChimpDecompressionState chimp_de_state; + uint32_t leading_zero_index; + uint8_t leading_zero_block_count; + idx_t leading_zero_block_size; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new uint8_t[8096]; + flags_arr = new uint8_t[1025]; + leading_zero_arr = new uint8_t[1024]; + dec_arr = new uint64_t[1024]; + leading_zero_unpacked = new uint8_t[1024]; + flags = new alp_bench::ChimpConstants::Flags[1024]; + } + + ~chimp_test() override { + delete[] dbl_arr; + delete[] data_arr; + delete[] flags_arr; + delete[] leading_zero_arr; + delete[] dec_arr; + } +}; + +TEST_F(chimp_test, test_chimp) { + for (auto& dataset : 
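+	     // Editorial note: each sample CSV is expected to hold exactly 1024
+	     // doubles; the loop body encodes them with Chimp, decodes, and
+	     // asserts a bit-exact round trip.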
alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + ASSERT_EQ(ifile.fail(), false); + + // Read Data + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c = c + 1; + } + + // Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.leading_zero_buffer.SetBuffer(leading_zero_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::ChimpCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + + // Init decoding + leading_zero_block_count = state.leading_zero_buffer.BlockCount(); + leading_zero_block_size = static_cast(leading_zero_block_count) * 8; + leading_zero_index = 0; + chimp_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + leading_zero_buffer.SetBuffer(leading_zero_arr); + + /* + * + * DECODE + * + */ + flags[0] = alp_bench::ChimpConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (idx_t i = 0; i < 1023; i++) { + flags[1 + i] = (alp_bench::ChimpConstants::Flags)flag_buffer.Extract(); + } + + for (idx_t i = 0; i < leading_zero_block_size; i++) { + leading_zero_unpacked[i] = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[leading_zero_buffer.Extract()]; + } + + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::ChimpDecompression::Load( + flags[i], leading_zero_unpacked, leading_zero_index, chimp_de_state); + } + chimp_de_state.Reset(); + + dec_dbl_p = reinterpret_cast(dec_arr); + + for (size_t i = 0; i < 1024; ++i) { + ASSERT_EQ(dbl_arr[i], dec_dbl_p[i]); + } + + ifile.close(); + } +} \ No newline at end of file diff --git a/benchmarks/test/test_chimp128.cpp b/benchmarks/test/test_chimp128.cpp new file mode 100644 index 0000000..2a96718 --- /dev/null +++ b/benchmarks/test/test_chimp128.cpp @@ -0,0 +1,150 @@ +#include "chimp/chimp128.hpp" +#include "data.hpp" +#include "gtest/gtest.h" +#include + +class chimp128_test : public ::testing::Test { +public: + uint8_t* data_arr; + uint8_t* flags_arr; + uint8_t* leading_zero_arr; + uint16_t* packed_data_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + + // Encode + alp_bench::Chimp128CompressionState com_stt; + uint8_t leading_zero_block_count; + + // Decode + idx_t leading_zero_block_size; + uint32_t unpacked_index; + uint32_t leading_zero_index; + alp_bench::FlagBuffer flag_buffer; + alp_bench::LeadingZeroBuffer leading_zero_buffer; + alp_bench::Chimp128DecompressionState chimp_de_state; + + alp_bench::ChimpConstants::Flags* flags; + uint8_t* leading_zero_unpacked; + alp_bench::UnpackedData* unpacked_data_arr; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new uint8_t[8096]; + flags_arr = new uint8_t[1025]; + leading_zero_arr = new uint8_t[1024]; + dec_arr = new uint64_t[1024]; + packed_data_arr = new uint16_t[1024]; + flags = new alp_bench::ChimpConstants::Flags[1025]; + leading_zero_unpacked = new uint8_t[1024]; + unpacked_data_arr = new alp_bench::UnpackedData[1024]; + } + + ~chimp128_test() override { + delete[] dbl_arr; + delete[] data_arr; + delete[] flags_arr; + delete[] leading_zero_arr; + delete[] dec_arr; + delete[] packed_data_arr; + delete[] flags; + delete[] leading_zero_unpacked; + delete[] unpacked_data_arr; + } +}; + +TEST_F(chimp128_test, test_chimp) { + for (auto& dataset : 
alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + ASSERT_EQ(ifile.fail(), false); + + // Read Data + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c = c + 1; + } + + // Init Encoding + com_stt.Reset(); + com_stt.output.SetStream(data_arr); + com_stt.leading_zero_buffer.SetBuffer(leading_zero_arr); + com_stt.flag_buffer.SetBuffer(flags_arr); + com_stt.packed_data_buffer.SetBuffer(packed_data_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::Chimp128Compression::Store(uint64_p[i], com_stt); + } + com_stt.Flush(); + com_stt.output.Flush(); + + // Init decoding + leading_zero_block_count = com_stt.leading_zero_buffer.BlockCount(); + leading_zero_block_size = static_cast(leading_zero_block_count) * 8; + unpacked_index = 0; + leading_zero_index = 0; + chimp_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + leading_zero_buffer.SetBuffer(leading_zero_arr); + + /* + * + * DECODE + * + */ + + // Decode flags + flags[0] = alp_bench::ChimpConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (idx_t i = 0; i < 1023; i++) { + flags[1 + i] = (alp_bench::ChimpConstants::Flags)flag_buffer.Extract(); + } + + // Decode leading zero + for (idx_t i = 0; i < leading_zero_block_size; i++) { + leading_zero_unpacked[i] = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[leading_zero_buffer.Extract()]; + } + + /* + * count how many cases of 'TRAILING_EXCEEDS_THRESHOLD' are based on the flags + * that is the exact number of packed data blocks + * that is the case in which in Chimp128 they save data in a block of 16 bits + */ + idx_t packed_data_block_count = 0; + for (idx_t i = 0; i < 1024; i++) { + packed_data_block_count += flags[1 + i] == alp_bench::ChimpConstants::Flags::TRAILING_EXCEEDS_THRESHOLD; + } + + for (idx_t i = 0; i < packed_data_block_count; i++) { + alp_bench::PackedDataUtils::Unpack(((uint16_t*)packed_data_arr)[i], unpacked_data_arr[i]); + if (unpacked_data_arr[i].significant_bits == 0) { unpacked_data_arr[i].significant_bits = 64; } + unpacked_data_arr[i].leading_zero = + alp_bench::ChimpConstants::Decompression::LEADING_REPRESENTATION[unpacked_data_arr[i].leading_zero]; + } + + chimp_de_state.Reset(); + + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::Chimp128Decompression::Load( + flags[i], leading_zero_unpacked, leading_zero_index, unpacked_data_arr, unpacked_index, chimp_de_state); + } + + dec_dbl_p = reinterpret_cast(dec_arr); + + for (size_t i = 0; i < 1024; ++i) { + ASSERT_EQ(dbl_arr[i], dec_dbl_p[i]); + } + + ifile.close(); + } +} \ No newline at end of file diff --git a/benchmarks/test/test_gorillas.cpp b/benchmarks/test/test_gorillas.cpp new file mode 100644 index 0000000..4c4d13a --- /dev/null +++ b/benchmarks/test/test_gorillas.cpp @@ -0,0 +1,98 @@ +#include "data.hpp" +#include "gorillas/gorillas.hpp" +#include "gtest/gtest.h" +#include + +class gorillas_test : public ::testing::Test { +public: + uint8_t* data_arr; + uint8_t* flags_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + alp_bench::GorillasCompressionState state; + alp_bench::GorillasConstants::Flags* flags; + alp_bench::FlagBuffer flag_buffer; + alp_bench::GorillasDecompressionState gorillas_de_state; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new 
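+		    // Editorial note: unlike the 8096-byte buffers in the other
+		    // tests, this reserves 8 KiB (1024 doubles stored raw in the
+		    // worst case) plus 1 KiB of headroom, presumably for the
+		    // leading-zero/width headers Gorillas interleaves into the
+		    // stream.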
uint8_t[8192 + 1024]; + flags_arr = new uint8_t[1025]; + dec_arr = new uint64_t[1024]; + flags = new alp_bench::GorillasConstants::Flags[1024]; + } + + ~gorillas_test() override { + delete[] dbl_arr; + delete[] data_arr; + delete[] flags_arr; + delete[] dec_arr; + } +}; + +TEST_F(gorillas_test, test_gorillas) { + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + ASSERT_EQ(ifile.fail(), false); + + // Read Data + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c = c + 1; + } + + // Init Encoding + state.Reset(); + state.output.SetStream(data_arr); + state.flag_buffer.SetBuffer(flags_arr); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + alp_bench::GorillasCompression::Store(uint64_p[i], state); + } + + state.Flush(); + state.output.Flush(); + + // Init decoding + gorillas_de_state.input.SetStream(data_arr); + flag_buffer.SetBuffer(flags_arr); + + /* + * + * DECODE + * + */ + flags[0] = alp_bench::GorillasConstants::Flags::VALUE_IDENTICAL; // First value doesn't require a flag + for (idx_t i = 0; i < 1023; i++) { + flags[1 + i] = (alp_bench::GorillasConstants::Flags)flag_buffer.Extract(); + } + + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::GorillasDecompression::Load(flags[i], gorillas_de_state); + } + gorillas_de_state.Reset(); + + dec_dbl_p = reinterpret_cast(dec_arr); + + for (size_t i = 0; i < 1024; ++i) { + auto l = dbl_arr[i]; + auto r = dec_dbl_p[i]; + if (l != r) { std::cerr << l << ", " << r << dataset.name << "\n"; } + + ASSERT_EQ(dbl_arr[i], dec_dbl_p[i]); + } + + ifile.close(); + } +} \ No newline at end of file diff --git a/benchmarks/test/test_patas.cpp b/benchmarks/test/test_patas.cpp new file mode 100644 index 0000000..79c4141 --- /dev/null +++ b/benchmarks/test/test_patas.cpp @@ -0,0 +1,105 @@ +#include "data.hpp" +#include "patas/patas.hpp" +#include "gtest/gtest.h" +#include + +class patas_test : public ::testing::Test { +public: + uint8_t* data_arr; + uint16_t* packed_data_arr; + double* dbl_arr; + double* dec_dbl_p; + uint64_t* dec_arr; + uint64_t* uint64_p; + + // Encode + uint16_t* packed_metadata; + alp_bench::patas::PatasCompressionState patas_state; + + // Decode + alp_bench::patas::PatasUnpackedValueStats* unpacked_data; + alp_bench::ByteReader byte_reader; + + void SetUp() override { + dbl_arr = new double[1024]; + data_arr = new uint8_t[8096]; + dec_arr = new uint64_t[1024]; + packed_data_arr = new uint16_t[1024]; + packed_metadata = new uint16_t[1024]; + unpacked_data = new alp_bench::patas::PatasUnpackedValueStats[1024]; + } + + ~patas_test() override { + delete[] dbl_arr; + delete[] data_arr; + delete[] dec_arr; + delete[] packed_data_arr; + delete[] packed_metadata; + delete[] unpacked_data; + } +}; + +TEST_F(patas_test, one_vec) { + for (auto& dataset : alp_bench::alp_dataset) { + std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in); + ASSERT_EQ(ifile.fail(), false); + + // Read Data + double num = 0.0; + // keep storing values from the text file so long as data exists: + size_t c {0}; + while (ifile >> num) { + dbl_arr[c] = num; + c = c + 1; + } + + // Init Encoding + patas_state.Reset(); + patas_state.SetOutputBuffer(data_arr); + patas_state.packed_data_buffer.SetBuffer(packed_metadata); + + /* + * + * Encode + * + */ + uint64_p = reinterpret_cast(dbl_arr); + for (size_t i {0}; i < 1024; ++i) { + 
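+			// Editorial note: the decode loop further down resets dec_arr[0]
+			// to zero before use. The first value is stored raw with metadata
+			// (index_diff = 0, bytes = 8, trailing_zero = 0), so
+			// DecompressValue XORs the raw word with dec_arr[0 - 0], i.e. the
+			// very slot being written. A fresh allocation often happens to be
+			// zeroed, which is why the test only failed from the second
+			// dataset onwards until that slot was cleared explicitly.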
alp_bench::patas::PatasCompression::Store(uint64_p[i], patas_state); + } + + // Init decoding + byte_reader.SetStream(data_arr); + + /* + * + * DECODE + * + */ + + // UNPACKING METADATA (16 bits - 3 bytes) + for (idx_t i = 0; i < 1024; i++) { + alp_bench::PackedDataUtils::Unpack(((uint16_t*)packed_metadata)[i], + (alp_bench::UnpackedData&)unpacked_data[i]); + } + + // USING UNPACKED METADATA AND DATA BUFFER WE LOAD THE DOUBLE VALUES + dec_arr[0] = (uint64_t)0; // Not sure why without this, it does not work on the > 2nd iteration... + for (idx_t i = 0; i < 1024; i++) { + dec_arr[i] = alp_bench::patas::PatasDecompression::DecompressValue( + byte_reader, + unpacked_data[i].significant_bytes, + unpacked_data[i].trailing_zeros, + dec_arr[i - unpacked_data[i].index_diff]); + } + + dec_dbl_p = reinterpret_cast(dec_arr); + + std::cout << dataset.name << std::endl; + for (size_t i = 0; i < 1024; ++i) { + ASSERT_EQ(dbl_arr[i], dec_dbl_p[i]); + } + + ifile.close(); + } +} \ No newline at end of file diff --git a/data/datasets_transformer.ipynb b/data/datasets_transformer.ipynb new file mode 100644 index 0000000..72106c0 --- /dev/null +++ b/data/datasets_transformer.ipynb @@ -0,0 +1,900 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "04eb7175", + "metadata": {}, + "source": [ + "# Datasets Preprocessing\n", + "\n", + "This Notebook contains the preprocessing of the raw datasets after being downloaded from the web. Our benchmarks and scripts expects a `.bin` file with only the doubles (no column header) in their IEEE 754 (`double`) representation. This notebook transforms each of the 30 datasets into this format.\n", + "\n", + "- Version: 0.0.1 \n", + "- Last Edited: 15/04/2023\n", + "\n", + "## Dependencies\n", + "To use this notebook you need Python (>3.7), Jupyter Notebook and the Libraries `pandas` and `numpy` installed (https://pandas.pydata.org/docs/). To install them, execute the next two cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7bc9d10", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89e3cac2", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7d408352", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "id": "d566b71d", + "metadata": {}, + "source": [ + "# NEON Datasets\n", + "\n", + "When downloaded and unzipped, these datasets are scattered across many directories and files. 
The following cells process every directory structure to build the dataset" + ] + }, + { + "cell_type": "markdown", + "id": "eea1fded", + "metadata": {}, + "source": [ + "## PM10-Dust\n", + "Download URL: https://doi.org/10.48443/4E6X-V373" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "008721b5", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({'PM10Median': []})\n", + "root = './NEON_size-dust-particulate/'\n", + "dates = os.listdir(root)\n", + "dates.sort()\n", + "for date in dates:\n", + " curr_path = root + date + '/'\n", + " if not os.path.isdir(curr_path): continue\n", + " enter = False\n", + " for filename in os.listdir(curr_path):\n", + " if '30_minutes' in filename and filename.endswith('.csv') and ('basic.2020' in filename or 'basic.2021' in filename):\n", + " enter = True\n", + " filename_path = curr_path + filename\n", + " df_tmp = pd.read_csv(filename_path)\n", + " df_tmp = df_tmp[['PM10Median']].dropna()\n", + " df = pd.concat([df, df_tmp], copy=False)\n", + " if enter == False:\n", + " print(curr_path)\n", + "df['PM10Median'].values.tofile('neon_pm10_dust.bin')\n" + ] + }, + { + "cell_type": "markdown", + "id": "05b6fa55", + "metadata": {}, + "source": [ + "## Dew-Point-Temp\n", + "\n", + "Download URL: https://doi.org/10.48443/Z99V-0502" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "0a7814a9", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({'dewTempMean': []})\n", + "root = './NEON_rel-humidity-buoy/'\n", + "dates = os.listdir(root)\n", + "dates.sort()\n", + "for date in dates:\n", + " curr_path = root + date + '/'\n", + " if not os.path.isdir(curr_path): continue\n", + " enter = False\n", + " for filename in os.listdir(curr_path):\n", + " if '1min' in filename and filename.endswith('.csv') and ('basic.2020' in filename or 'basic.2021' in filename):\n", + " enter = True\n", + " filename_path = curr_path + filename\n", + " df_tmp = pd.read_csv(filename_path)\n", + " df_tmp = df_tmp[['dewTempMean']].dropna()\n", + " df = pd.concat([df, df_tmp], copy=False)\n", + " if enter == False:\n", + " print(curr_path)\n", + "df['dewTempMean'].values.tofile('neon_dew_point_temp.bin')\n" + ] + }, + { + "cell_type": "markdown", + "id": "c50e94d8", + "metadata": {}, + "source": [ + "## Air Pressure\n", + "\n", + "Download URL: https://doi.org/10.48443/RXR7-PP32" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ef09da27", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({'staPresMean': []})\n", + "root = './NEON_pressure-air/'\n", + "dates = os.listdir(root)\n", + "dates.sort()\n", + "for date in dates:\n", + " curr_path = root + date + '/'\n", + " if not os.path.isdir(curr_path): continue\n", + " enter = False\n", + " for filename in os.listdir(curr_path):\n", + " if '1min' in filename and filename.endswith('.csv') and ('basic.2020' in filename or 'basic.2021' in filename):\n", + " enter = True\n", + " filename_path = curr_path + filename\n", + " df_tmp = pd.read_csv(filename_path)\n", + " df_tmp = df_tmp[['staPresMean']].dropna()\n", + " df = pd.concat([df, df_tmp], copy=False)\n", + " if enter == False:\n", + " print(curr_path)\n", + "df['staPresMean'].values.tofile('neon_air_pressure.bin')\n" + ] + }, + { + "cell_type": "markdown", + "id": "13e502a6", + "metadata": {}, + "source": [ + "## Wind Direction\n", + "\n", + "Download URL: https://doi.org/10.48443/S9YA-ZC81" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "fd1c0bc8", + 
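As the introduction above notes, each dataset ends up as a headerless `.bin` of IEEE 754 doubles (`values.tofile(...)` writes only the raw array bytes). A minimal C++ sketch of reading one back, with the filename purely illustrative:

```cpp
#include <cstddef>
#include <fstream>
#include <iostream>
#include <vector>

int main() {
	// Any of the generated files works here; the name is an example.
	std::ifstream in("neon_pm10_dust.bin", std::ios::binary);
	if (!in) { return 1; }
	in.seekg(0, std::ios::end);
	const std::size_t bytes = static_cast<std::size_t>(in.tellg());
	in.seekg(0, std::ios::beg);
	// Headerless sequence of native-endian IEEE 754 doubles.
	std::vector<double> values(bytes / sizeof(double));
	in.read(reinterpret_cast<char*>(values.data()), static_cast<std::streamsize>(bytes));
	if (values.empty()) { return 1; }
	std::cout << values.size() << " values, first = " << values.front() << "\n";
}
```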
"metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({'windDirMean': []})\n", + "root = './NEON_wind-2d/'\n", + "dates = os.listdir(root)\n", + "dates.sort()\n", + "for date in dates:\n", + " curr_path = root + date + '/'\n", + " if not os.path.isdir(curr_path): continue\n", + " enter = False\n", + " for filename in os.listdir(curr_path):\n", + " if '2min' in filename and filename.endswith('.csv') and ('basic.2020' in filename or 'basic.2021' in filename):\n", + " enter = True\n", + " filename_path = curr_path + filename\n", + " df_tmp = pd.read_csv(filename_path)\n", + " df_tmp = df_tmp[['windDirMean']].dropna()\n", + " df = pd.concat([df, df_tmp], copy=False)\n", + " if enter == False:\n", + " print(curr_path)\n", + "df['windDirMean'].values.tofile('neon_wind_dir.bin')\n" + ] + }, + { + "cell_type": "markdown", + "id": "67b89c44", + "metadata": {}, + "source": [ + "## IR-Bio-Temp\n", + "\n", + "Download URL: https://doi.org/10.48443/JNWY-B177" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "93b4125c", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({'bioTempMean': []})\n", + "root = './NEON_temp-bio/'\n", + "dates = os.listdir(root)\n", + "dates.sort()\n", + "for date in dates:\n", + " curr_path = root + date + '/'\n", + " if not os.path.isdir(curr_path): continue\n", + " enter = False\n", + " for filename in os.listdir(curr_path):\n", + " if '1_minute' in filename and filename.endswith('.csv') and ('basic.2020' in filename or 'basic.2021' in filename):\n", + " enter = True\n", + " filename_path = curr_path + filename\n", + " df_tmp = pd.read_csv(filename_path)\n", + " df_tmp = df_tmp[['bioTempMean']].dropna()\n", + " df = pd.concat([df, df_tmp], copy=False)\n", + " if enter == False:\n", + " print(curr_path)\n", + "df['bioTempMean'].values.tofile('neon_bio_temp.bin')\n" + ] + }, + { + "cell_type": "markdown", + "id": "e6bac3ce", + "metadata": {}, + "source": [ + "# STOCKS Datasets\n", + "\n", + "When downloaded and unzipped, these datasets are scattered across many directories and files. The following cells process every directory structure to build the dataset" + ] + }, + { + "cell_type": "markdown", + "id": "650204b2", + "metadata": {}, + "source": [ + "## Stocks USA / DE / UK\n", + "\n", + "Download URL: https://zenodo.org/record/3886895#%23.ZDBBKuxBz0r\n", + "\n", + "- Create the directory with the name inside the variable `tmp_dir`. 
We will save temporary CSVs on this directory that will then be joined (we do this to optimize the process)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "c72f0537", + "metadata": {}, + "outputs": [], + "source": [ + "countries = ['Germany', 'United Kingdom', 'USA']\n", + "tmp_dir = 'stocks_tmp'" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "b01cc3b2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "01012019\n", + "01022019\n", + "01032019\n", + "01042019\n", + "01012019\n", + "01022019\n", + "01032019\n", + "01042019\n", + "01012019\n", + "01022019\n", + "01032019\n", + "01042019\n" + ] + } + ], + "source": [ + "for country in countries:\n", + " file_prefix = 'Stocks ' + country\n", + " df = pd.DataFrame({2: []})\n", + " root = './FinancialData/quotes/'\n", + " dates = os.listdir(root)\n", + " dates.sort()\n", + " x = 0\n", + " for date in dates:\n", + " df_date = pd.DataFrame({2: []})\n", + " if not date.isdigit(): continue\n", + " curr_path = root + date + '/'\n", + " print(date)\n", + " for filename in os.listdir(curr_path):\n", + " if filename.startswith(file_prefix) and filename.endswith('.zip'):\n", + " filename_path = curr_path + filename\n", + " df_tmp = pd.read_csv(filename_path, compression='zip', header=None)\n", + " df_tmp = df_tmp[[2]]\n", + " df_date = pd.concat([df_date, df_tmp], copy=False)\n", + " df_date.to_csv('./' + tmp_dir + '/stocks_' + country + '_' + date + \".csv\", index=False, header=['value'])\n", + " if x == 3:\n", + " break\n", + " x+=1\n", + " dates = os.listdir(tmp_dir)\n", + " dates.sort()\n", + " df_list = []\n", + " for date in dates:\n", + " if country not in date:\n", + " continue\n", + " curr_path = tmp_dir + '/' + date\n", + " df = pd.read_csv(curr_path)\n", + " df_list.append(df)\n", + " df_final = pd.concat(df_list)\n", + " df_final['value'].values.tofile('stocks_' + country + '.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "4634ba80", + "metadata": {}, + "source": [ + "# Public BI Benchmark\n", + "Reference: https://github.com/cwida/public_bi_benchmark" + ] + }, + { + "cell_type": "markdown", + "id": "6483ad33", + "metadata": {}, + "source": [ + "## CommonGov\n", + "Download URL: https://homepages.cwi.nl/~boncz/PublicBIbenchmark/CommonGovernment/\n", + "\n", + "Download each of the files inside the URL" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4b472225", + "metadata": {}, + "outputs": [], + "source": [ + "gov1 = pd.read_csv('./CommonGovernment_1.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov2 = pd.read_csv('./CommonGovernment_2.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov3 = pd.read_csv('./CommonGovernment_3.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov4 = pd.read_csv('./CommonGovernment_4.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov5 = pd.read_csv('./CommonGovernment_5.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov6 = pd.read_csv('./CommonGovernment_6.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov7 = pd.read_csv('./CommonGovernment_7.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov8 = pd.read_csv('./CommonGovernment_8.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov9 = pd.read_csv('./CommonGovernment_9.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov10 = 
pd.read_csv('./CommonGovernment_10.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov11 = pd.read_csv('./CommonGovernment_11.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov12 = pd.read_csv('./CommonGovernment_12.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov13 = pd.read_csv('./CommonGovernment_13.csv.bz2', header=None, sep='|', usecols=[9, 25, 29, 30, 39])\n", + "gov = pd.concat([gov1, gov2, gov3, gov4, gov5, gov6, gov7, gov8, gov9, gov10, gov11, gov12, gov13])\n", + "gov = gov.dropna()\n", + "gov[9].values.tofile('gov10.bin')\n", + "gov[25].values.tofile('gov26.bin')\n", + "gov[29].values.tofile('gov30.bin') \n", + "gov[30].values.tofile('gov31.bin') \n", + "gov[39].values.tofile('gov40.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "18588ffd", + "metadata": {}, + "source": [ + "# Medicare\n", + "Download URL: https://homepages.cwi.nl/~boncz/PublicBIbenchmark/Medicare3/" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6142f975", + "metadata": {}, + "outputs": [], + "source": [ + "medicare1 = pd.read_csv('./Medicare3_1.csv.bz2', header=None, sep='|', usecols=[0, 8], dtype=np.float64)\n", + "medicare = medicare1.dropna()\n", + "medicare[0].values.tofile('medicare1.bin')\n", + "medicare[8].values.tofile('medicare9.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "c97db032", + "metadata": {}, + "source": [ + "# CMSProvider\n", + "\n", + "Download URL: https://homepages.cwi.nl/~boncz/PublicBIbenchmark/CMSprovider/ \n", + "\n", + "Download each of the files inside the URL" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d3b5224f", + "metadata": {}, + "outputs": [], + "source": [ + "cms1 = pd.read_csv('./CMSprovider_1.csv.bz2', header=None, sep='|', usecols=[0, 8, 24], dtype=np.float64)\n", + "cms2 = pd.read_csv('./CMSprovider_2.csv.bz2', header=None, sep='|', usecols=[0, 8, 24], dtype=np.float64)\n", + "cms = pd.concat([cms1, cms2])\n", + "cms = cms.dropna()\n", + "cms[0].values.tofile('cms1.bin')\n", + "cms[8].values.tofile('cms9.bin')\n", + "cms[24].values.tofile('cms25.bin') " + ] + }, + { + "cell_type": "markdown", + "id": "cd49b170", + "metadata": {}, + "source": [ + "# NYC\n", + "\n", + "Download URL: https://homepages.cwi.nl/~boncz/PublicBIbenchmark/NYC/\n", + "\n", + "Download each of the files inside the URL" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "74820721", + "metadata": {}, + "outputs": [], + "source": [ + "nyc1 = pd.read_csv('./NYC_1.csv.bz2', header=None, sep='|', usecols=[28])\n", + "nyc2 = pd.read_csv('./NYC_2.csv.bz2', header=None, sep='|', usecols=[28])\n", + "nyc = pd.concat([nyc1, nyc2])\n", + "nyc = nyc.dropna()\n", + "nyc[28].values.tofile('nyc29.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "f2b579d7", + "metadata": {}, + "source": [ + "# Arade\n", + "\n", + "Download URL: https://homepages.cwi.nl/~boncz/PublicBIbenchmark/Arade/ \n", + "\n", + "Download each of the files inside the URL" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "0ac52650", + "metadata": {}, + "outputs": [], + "source": [ + "arade = pd.read_csv('./Arade_1.csv.bz2', sep='|', header=None)\n", + "arade[3].values.tofile('arade4.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "7bfbbe94", + "metadata": {}, + "source": [ + "# The Other Datasets" + ] + }, + { + "cell_type": "markdown", + "id": "939a6346", + "metadata": {}, + "source": [ + "# POI (Lat - Lon)\n", + "\n", + "Download URL: 
https://www.kaggle.com/datasets/ehallmar/points-of-interest-poi-database" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "8dac69b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML view of the poi dataframe: 424205 rows × 7 columns]\n",
+        "
" + ], + "text/plain": [ + " name latitude_radian \\\n", + "0 YAYCHI, WEST AZERBAIJAN 0.683175 \n", + "1 MOUNT FISKE GLACIER 0.648196 \n", + "2 ALATONA 0.258356 \n", + "3 PEMBA ISLAND -0.090175 \n", + "4 MBOLO 0.149517 \n", + "... ... ... \n", + "424200 PITTSFIELD TOWNSHIP, LORAIN COUNTY, OHIO 0.719842 \n", + "424201 KEVIN BARTLETT RESERVE -0.660297 \n", + "424202 WEST SPRINGFIELD, VIRGINIA 0.676984 \n", + "424203 GLEN ROCK HISTORIC DISTRICT 0.694515 \n", + "424204 PURSAT PROVINCE 0.218748 \n", + "\n", + " longitude_radian num_links \\\n", + "0 0.778053 13 \n", + "1 -2.071114 9 \n", + "2 -0.103606 10 \n", + "3 0.694350 43 \n", + "4 0.359829 6 \n", + "... ... ... \n", + "424200 -1.434913 28 \n", + "424201 2.531149 9 \n", + "424202 -1.347966 35 \n", + "424203 -1.339240 6 \n", + "424204 1.813688 33 \n", + "\n", + " links num_categories \\\n", + "0 Baba Jik Rural District; West Azerbaijan Provi... 1 \n", + "1 Mount Fiske; Mount Warlow Glacier; U.S. state;... 3 \n", + "2 Diabaly; Alatona Irrigation Project; Mali; Nio... 2 \n", + "3 Malaysia Airlines Flight 370; Arusha; Chake Ch... 5 \n", + "4 UTC; Sub-prefectures of the Central African Re... 2 \n", + "... ... ... \n", + "424200 Oberlin, Ohio; List of sovereign states; Civil... 1 \n", + "424201 Burnley, Victoria; Collingwood City F.C.; Rich... 2 \n", + "424202 Fairfax County, Virginia; List of sovereign st... 4 \n", + "424203 York County, Pennsylvania; Historic district (... 4 \n", + "424204 Cambodia; List of sovereign states; Phnom Penh... 2 \n", + "\n", + " categories \n", + "0 POPULATED PLACES IN CHALDORAN COUNTY \n", + "1 GLACIERS OF THE SIERRA NEVADA (U.S.); GLACIERS... \n", + "2 POPULATED PLACES IN SÉGOU REGION; IRRIGATION P... \n", + "3 PEMBA ISLAND; ISLANDS OF TANZANIA; ISLANDS OF ... \n", + "4 N'DÉLÉ; POPULATED PLACES IN BAMINGUI-BANGORAN \n", + "... ... \n", + "424200 TOWNSHIPS IN LORAIN COUNTY, OHIO \n", + "424201 SOCCER VENUES IN AUSTRALIA; SPORTS VENUES IN M... \n", + "424202 CENSUS-DESIGNATED PLACES IN FAIRFAX COUNTY, VI... \n", + "424203 HISTORIC DISTRICTS ON THE NATIONAL REGISTER OF... 
\n", + "424204 PROVINCES OF CAMBODIA; PURSAT PROVINCE \n", + "\n", + "[424205 rows x 7 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "poi = pd.read_csv('./poi.csv')\n", + "poi['latitude_radian'].values.tofile('poi_lat.bin')\n", + "poi['longitude_radian'].values.tofile('poi_lon.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "9c5b301c", + "metadata": {}, + "source": [ + "## Food Prices\n", + "\n", + "Download URL: https://data.humdata.org/dataset/wfp-food-prices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3607786", + "metadata": {}, + "outputs": [], + "source": [ + "food_prices_df = pd.read_csv('wfpvam_foodprices.csv')\n", + "food_prices_df['mp_price'].values.tofile('food_prices.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "0b3545f3", + "metadata": {}, + "source": [ + "## Bird-migration\n", + "\n", + "Download URL: https://github.com/influxdata/influxdb2-sample-data/blob/master/bird-migration-data/bird-migration.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "28bb3fae", + "metadata": {}, + "outputs": [], + "source": [ + "bird_migration_df = pd.read_csv('./bird_migration.csv', skiprows=3)\n", + "bird_migration_df['_value'].values.tofile('bird_migration_f.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "325b8afa", + "metadata": {}, + "source": [ + "## Bitcoin-price\n", + "\n", + "Download URL: https://raw.githubusercontent.com/influxdata/influxdb2-sample-data/master/bitcoin-price-data/bitcoin-historical-annotated.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "6835ea57", + "metadata": {}, + "outputs": [], + "source": [ + "bitcoin_price_df = pd.read_csv('./bitcoin.csv', skiprows=3)\n", + "bitcoin_price_df['_value'].values.tofile('bitcoin_f.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "320c3421", + "metadata": {}, + "source": [ + "## Bitcoin Transaction\n", + "\n", + "Download URL: https://gz.blockchair.com/bitcoin/transactions/\n", + "\n", + "Search and download for day 2022/03/26, or directly download using this link:\n", + "https://gz.blockchair.com/bitcoin/transactions/blockchair_bitcoin_transactions_20220326.tsv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7c363e95", + "metadata": {}, + "outputs": [], + "source": [ + "bitcoin_tr_df = pd.read_csv('./bitcoin_transactions.tsv.gz', delimiter='\\t', compression='gzip')\n", + "bitcoin_tr_df['output_total_usd'].values.tofile('bitcoin_transactions_f.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "839191d7", + "metadata": {}, + "source": [ + "## SSD-HDD Benchmark\n", + "\n", + "Download URL: https://www.kaggle.com/datasets/alanjo/ssd-and-hdd-benchmarks" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "14f2e4ed", + "metadata": {}, + "outputs": [], + "source": [ + "ssd_df = pd.read_csv('SSD_HDD_benchmarks_v9.csv')\n", + "ssd_df['diskCapacity'].values.tofile('ssd_hdd_benchmarks_f.bin')" + ] + }, + { + "cell_type": "markdown", + "id": "312b6dc6", + "metadata": {}, + "source": [ + "## City Temperature\n", + "\n", + "Download URL: https://www.kaggle.com/datasets/sudalairajkumar/daily-temperature-of-major-cities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d70a4c57", + "metadata": {}, + "outputs": [], + "source": [ + "city_temp_df = pd.read_csv('city_temperature.csv.gz', compression='gzip', header=None)\n", + "city_temp_df[2].values.tofile('city_temperature_f.bin')" + ] + }, + { + 
"cell_type": "markdown", + "id": "668e82f7", + "metadata": {}, + "source": [ + "## Basel-Temp & Wind\n", + "\n", + "Download URL: https://www.meteoblue.com/en/weather/archive/export/basel_switzerland\n", + "\n", + "- Date filters: 2008-01-01 until 2022-01-31 \n", + "- Boxes selected: Temperature and Wind Speed (10m)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "93aad8e0", + "metadata": {}, + "outputs": [], + "source": [ + "basel = pd.read_csv('dataexport_20230415T103128.csv', header=None, skiprows=10)\n", + "basel_wind = basel[2].values.tofile('basel_wind_t.bin')\n", + "basel_temp = basel[1].values.tofile('basel_temp.bin')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/edge_case/edge_case.csv b/data/edge_case/edge_case.csv new file mode 100644 index 0000000..757b99d --- /dev/null +++ b/data/edge_case/edge_case.csv @@ -0,0 +1,1025 @@ +-INFINITY +-INFINITY +-0.0 +-0.0 +-0.0 +inf +INFINITY +Inf +nan +Nan +NaN +NAN +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 + diff --git a/data/generated/generated_doubles_bw0.csv b/data/generated/generated_doubles_bw0.csv new file mode 100644 index 0000000..d046170 --- /dev/null +++ b/data/generated/generated_doubles_bw0.csv @@ -0,0 +1,1024 @@ +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw1.csv b/data/generated/generated_doubles_bw1.csv new file mode 100644 index 0000000..a930d4b --- /dev/null +++ b/data/generated/generated_doubles_bw1.csv @@ -0,0 +1,1024 @@ +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 +1.0 
+1.0
[... tail of the preceding generated-doubles file: the value 1.0 repeated, ending in a final lone 0; no newline at end of file ...]
diff --git a/data/generated/generated_doubles_bw10.csv b/data/generated/generated_doubles_bw10.csv
new file mode 100644
index 0000000..fc9c307
--- /dev/null
+++ b/data/generated/generated_doubles_bw10.csv
@@ -0,0 +1,1024 @@
+1023.0
[... the value 1023.0 (2^10 - 1) repeated for 1,023 lines, then a final lone 0; no newline at end of file ...]
diff --git a/data/generated/generated_doubles_bw11.csv b/data/generated/generated_doubles_bw11.csv
new file mode 100644
index 0000000..93f6041
--- /dev/null
+++ b/data/generated/generated_doubles_bw11.csv
@@ -0,0 +1,1024 @@
+2047.0
[... the value 2047.0 (2^11 - 1) repeated for 1,023 lines, then a final lone 0; no newline at end of file ...]
diff --git a/data/generated/generated_doubles_bw12.csv b/data/generated/generated_doubles_bw12.csv
new file mode 100644
index 0000000..a641076
--- /dev/null
+++ b/data/generated/generated_doubles_bw12.csv
@@ -0,0 +1,1024 @@
+4095.0
[... the value 4095.0 (2^12 - 1) repeated for 1,023 lines, then a final lone 0; no newline at end of file ...]
diff --git a/data/generated/generated_doubles_bw13.csv b/data/generated/generated_doubles_bw13.csv
new file mode 100644
index 0000000..3b89f5a
--- /dev/null
+++ b/data/generated/generated_doubles_bw13.csv
@@ -0,0 +1,1024 @@
+8191.0
[... the value 8191.0 (2^13 - 1) repeated for 1,023 lines, then a final lone 0; no newline at end of file ...]
diff --git a/data/generated/generated_doubles_bw14.csv b/data/generated/generated_doubles_bw14.csv
new file mode 100644
index 0000000..a396ee1
--- /dev/null
+++ b/data/generated/generated_doubles_bw14.csv
@@ -0,0 +1,1024 @@
+16383.0
[... the value 16383.0 (2^14 - 1) repeated for 1,023 lines, then a final lone 0; no newline at end of file ...]
diff --git a/data/generated/generated_doubles_bw15.csv b/data/generated/generated_doubles_bw15.csv
new file mode 100644
index 0000000..4474cdc
--- /dev/null
+++ b/data/generated/generated_doubles_bw15.csv
@@ -0,0 +1,1024 @@
+32767.0
[... the value 32767.0 (2^15 - 1) repeated for 1,023 lines, then a final lone 0; no newline at end of file ...]
diff --git a/data/generated/generated_doubles_bw16.csv b/data/generated/generated_doubles_bw16.csv
new file mode 100644
index 0000000..6efe711
--- /dev/null
+++ b/data/generated/generated_doubles_bw16.csv
@@ -0,0 +1,1024 @@
+65535.0
[... the value 65535.0 (2^16 - 1) repeated for 1,023 lines, then a final lone 0; no newline at end of file ...]
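Note: every generated-doubles CSV added by this patch shares one mechanical shape: for bit width bw, the file holds 1,023 repetitions of the double 2^bw - 1 followed by a single 0, with no trailing newline. For reference, below is a minimal, hypothetical C++ sketch that would emit files of exactly this shape. It is inferred from the diff contents alone; the loop bounds (bit widths 10 through 17, the files visible in this excerpt) and the output path come from the file names above, and the repository's actual data generator is not part of this excerpt.

// Hypothetical sketch (not from this patch): regenerate the
// data/generated/generated_doubles_bw<N>.csv files as they appear in the diff.
#include <fstream>
#include <iomanip>
#include <string>

int main() {
	for (int bw = 10; bw <= 17; ++bw) {
		// Value rule inferred from the diff: 2^bw - 1, e.g. bw = 10 -> 1023.0.
		const double value = static_cast<double>((1ULL << bw) - 1);
		std::ofstream csv("data/generated/generated_doubles_bw" + std::to_string(bw) + ".csv");
		csv << std::fixed << std::setprecision(1);
		for (int i = 0; i < 1023; ++i) {
			csv << value << '\n'; // 1,023 identical doubles, one per line
		}
		csv << 0; // final lone 0; the files end without a trailing newline
	}
	return 0;
}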
+65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +65535.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw17.csv b/data/generated/generated_doubles_bw17.csv new file mode 100644 index 0000000..8913841 --- /dev/null +++ b/data/generated/generated_doubles_bw17.csv @@ -0,0 +1,1024 @@ +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 
+131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 
+131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 
+131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +131071.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw18.csv b/data/generated/generated_doubles_bw18.csv new file mode 100644 index 0000000..64cd0c0 --- /dev/null +++ b/data/generated/generated_doubles_bw18.csv @@ -0,0 +1,1024 @@ +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 
+262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 
+262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 
+262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +262143.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw19.csv b/data/generated/generated_doubles_bw19.csv new file mode 100644 index 0000000..4d1e5a2 --- /dev/null +++ b/data/generated/generated_doubles_bw19.csv @@ -0,0 +1,1024 @@ +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 
+524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 
+524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 
+524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +524287.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw2.csv b/data/generated/generated_doubles_bw2.csv new file mode 100644 index 0000000..b04bcb7 --- /dev/null +++ b/data/generated/generated_doubles_bw2.csv @@ -0,0 +1,1024 @@ +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 
+3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +3.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw20.csv b/data/generated/generated_doubles_bw20.csv new file mode 100644 index 0000000..4a65e9d --- /dev/null +++ b/data/generated/generated_doubles_bw20.csv @@ -0,0 +1,1024 @@ +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 
+1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 
+1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 
+1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +1048575.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw21.csv b/data/generated/generated_doubles_bw21.csv new file mode 100644 index 0000000..be92d24 --- /dev/null +++ b/data/generated/generated_doubles_bw21.csv @@ -0,0 +1,1024 @@ +2097151.0 
+2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 +2097151.0 
+2097151.0
+2097151.0
[... remaining "+2097151.0" lines elided. Every generated_doubles_bwN.csv in this patch follows one pattern: 1024 lines, of which the first 1023 repeat the value 2^N - 1 and the last is a single 0 ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw22.csv b/data/generated/generated_doubles_bw22.csv
new file mode 100644
index 0000000..1ac909f
--- /dev/null
+++ b/data/generated/generated_doubles_bw22.csv
@@ -0,0 +1,1024 @@
+4194303.0
[... 1022 further "+4194303.0" lines elided ...]
+0
\ No newline at end of file
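[Editorial aside, not part of the patch: the bw21-bw27 files collapse to one rule, 1023 copies of 2^N - 1 written as a double plus a trailing 0. A minimal C++ sketch that would reproduce files of this shape follows; the data/generated/ output path, the loop bounds, and the exact "#.0" formatting are assumptions read off the hunks above, and this is not claimed to be the generator actually used for this patch.]

    // Sketch: emit 1024-line CSVs of (2^bw - 1) doubles, one file per bit width.
    #include <cstdint>
    #include <fstream>
    #include <string>

    int main() {
        for (int bw = 21; bw <= 27; ++bw) {
            const uint64_t value = (uint64_t{1} << bw) - 1; // 2^bw - 1, e.g. 4194303 for bw = 22
            std::ofstream csv("data/generated/generated_doubles_bw" + std::to_string(bw) + ".csv");
            for (int i = 0; i < 1023; ++i) {
                csv << value << ".0\n"; // integer value rendered as a double, e.g. "4194303.0"
            }
            csv << "0"; // final line; no trailing newline, matching "\ No newline at end of file"
        }
        return 0;
    }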
diff --git a/data/generated/generated_doubles_bw23.csv b/data/generated/generated_doubles_bw23.csv
new file mode 100644
index 0000000..b482f28
--- /dev/null
+++ b/data/generated/generated_doubles_bw23.csv
@@ -0,0 +1,1024 @@
+8388607.0
[... 1022 further "+8388607.0" lines elided ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw24.csv b/data/generated/generated_doubles_bw24.csv
new file mode 100644
index 0000000..50d5d4e
--- /dev/null
+++ b/data/generated/generated_doubles_bw24.csv
@@ -0,0 +1,1024 @@
+16777215.0
[... 1022 further "+16777215.0" lines elided ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw25.csv b/data/generated/generated_doubles_bw25.csv
new file mode 100644
index 0000000..4f549c0
--- /dev/null
+++ b/data/generated/generated_doubles_bw25.csv
@@ -0,0 +1,1024 @@
+33554431.0
[... 1022 further "+33554431.0" lines elided ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw26.csv b/data/generated/generated_doubles_bw26.csv
new file mode 100644
index 0000000..d8ac205
--- /dev/null
+++ b/data/generated/generated_doubles_bw26.csv
@@ -0,0 +1,1024 @@
+67108863.0
[... 1022 further "+67108863.0" lines elided ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw27.csv b/data/generated/generated_doubles_bw27.csv
new file mode 100644
index 0000000..93ca062
--- /dev/null
+++ b/data/generated/generated_doubles_bw27.csv
@@ -0,0 +1,1024 @@
+134217727.0
[... the "+134217727.0" run continues ...]
+134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 
+134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 
+134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +134217727.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw28.csv b/data/generated/generated_doubles_bw28.csv new file mode 100644 index 0000000..ea7f8ba --- /dev/null +++ b/data/generated/generated_doubles_bw28.csv @@ -0,0 +1,1024 @@ +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 
+268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 
+268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 
+268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +268435455.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw29.csv b/data/generated/generated_doubles_bw29.csv new file mode 100644 index 0000000..abcb5c6 --- /dev/null +++ b/data/generated/generated_doubles_bw29.csv @@ -0,0 +1,1024 @@ +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 
+536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 
+536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 
+536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 
+536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +536870911.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw3.csv b/data/generated/generated_doubles_bw3.csv new file mode 100644 index 0000000..9a3b722 --- /dev/null +++ b/data/generated/generated_doubles_bw3.csv @@ -0,0 +1,1024 @@ +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 
+7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 
+7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +7.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw30.csv b/data/generated/generated_doubles_bw30.csv new file mode 100644 index 0000000..c179f65 --- /dev/null +++ b/data/generated/generated_doubles_bw30.csv @@ -0,0 +1,1024 @@ +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 
+1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 +1073741823.0 
+1073741823.0
[... tail of the preceding hunk: +1073741823.0 (2^30 - 1) repeated on every added value line ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw31.csv b/data/generated/generated_doubles_bw31.csv
new file mode 100644
index 0000000..9dda733
--- /dev/null
+++ b/data/generated/generated_doubles_bw31.csv
@@ -0,0 +1,1024 @@
+2147483647.0
[... +2147483647.0 (2^31 - 1) repeated for 1023 value lines, then a final +0 ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw32.csv b/data/generated/generated_doubles_bw32.csv
new file mode 100644
index 0000000..d271825
--- /dev/null
+++ b/data/generated/generated_doubles_bw32.csv
@@ -0,0 +1,1024 @@
+4294967295.0
[... +4294967295.0 (2^32 - 1) repeated for 1023 value lines, then a final +0 ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw33.csv b/data/generated/generated_doubles_bw33.csv
new file mode 100644
index 0000000..b7ac370
--- /dev/null
+++ b/data/generated/generated_doubles_bw33.csv
@@ -0,0 +1,1024 @@
+8589934591.0
[... +8589934591.0 (2^33 - 1) repeated for 1023 value lines, then a final +0 ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw34.csv b/data/generated/generated_doubles_bw34.csv
new file mode 100644
index 0000000..c4d4525
--- /dev/null
+++ b/data/generated/generated_doubles_bw34.csv
@@ -0,0 +1,1024 @@
+17179869183.0
[... +17179869183.0 (2^34 - 1) repeated for 1023 value lines, then a final +0 ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw35.csv b/data/generated/generated_doubles_bw35.csv
new file mode 100644
index 0000000..405636a
--- /dev/null
+++ b/data/generated/generated_doubles_bw35.csv
@@ -0,0 +1,1024 @@
+34359738367.0
[... +34359738367.0 (2^35 - 1) repeated; the remaining value lines of this hunk follow ...]
+34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 
+34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 
+34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 
+34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +34359738367.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw36.csv b/data/generated/generated_doubles_bw36.csv new file mode 100644 index 0000000..cf24714 --- /dev/null +++ b/data/generated/generated_doubles_bw36.csv @@ -0,0 +1,1024 @@ +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 
+68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 
+68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 
+68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 
+68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +68719476735.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw37.csv b/data/generated/generated_doubles_bw37.csv new file mode 100644 index 0000000..f3e42c2 --- /dev/null +++ b/data/generated/generated_doubles_bw37.csv @@ -0,0 +1,1024 @@ +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 
+137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 
+137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 
+137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 
+137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 
+137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +137438953471.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw38.csv b/data/generated/generated_doubles_bw38.csv new file mode 100644 index 0000000..74fde9b --- /dev/null +++ b/data/generated/generated_doubles_bw38.csv @@ -0,0 +1,1024 @@ +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 
+274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 +274877906943.0 
+274877906943.0
[... value 274877906943.0 (= 2^38 - 1) repeated on every remaining data line of this 1,024-line hunk ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw39.csv b/data/generated/generated_doubles_bw39.csv
new file mode 100644
index 0000000..f4cbd53
--- /dev/null
+++ b/data/generated/generated_doubles_bw39.csv
@@ -0,0 +1,1024 @@
+549755813887.0
[... value 549755813887.0 (= 2^39 - 1) repeated for 1,023 lines in total ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw4.csv b/data/generated/generated_doubles_bw4.csv
new file mode 100644
index 0000000..2b7faa1
--- /dev/null
+++ b/data/generated/generated_doubles_bw4.csv
@@ -0,0 +1,1024 @@
+15.0
[... value 15.0 (= 2^4 - 1) repeated for 1,023 lines in total ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw40.csv b/data/generated/generated_doubles_bw40.csv
new file mode 100644
index 0000000..84cafb8
--- /dev/null
+++ b/data/generated/generated_doubles_bw40.csv
@@ -0,0 +1,1024 @@
+1099511627775.0
[... value 1099511627775.0 (= 2^40 - 1) repeated for 1,023 lines in total ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw41.csv b/data/generated/generated_doubles_bw41.csv
new file mode 100644
index 0000000..f544b4c
--- /dev/null
+++ b/data/generated/generated_doubles_bw41.csv
@@ -0,0 +1,1024 @@
+2199023255551.0
[... value 2199023255551.0 (= 2^41 - 1) repeated for 1,023 lines in total ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw42.csv b/data/generated/generated_doubles_bw42.csv
new file mode 100644
index 0000000..e0cdd0a
--- /dev/null
+++ b/data/generated/generated_doubles_bw42.csv
@@ -0,0 +1,1024 @@
+4398046511103.0
[... value 4398046511103.0 (= 2^42 - 1) repeated; this 1,024-line hunk continues below ...]
+4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 
+4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 
+4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 
+4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +4398046511103.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw43.csv b/data/generated/generated_doubles_bw43.csv new file mode 100644 index 0000000..e75ea57 --- /dev/null +++ b/data/generated/generated_doubles_bw43.csv @@ -0,0 +1,1024 @@ +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 
+8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 
+8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 
+8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 
+8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 
+8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +8796093022207.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw44.csv b/data/generated/generated_doubles_bw44.csv new file mode 100644 index 0000000..6e4c01c --- /dev/null +++ b/data/generated/generated_doubles_bw44.csv @@ -0,0 +1,1024 @@ +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 
+17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 
+17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 
+17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 
+17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 
+17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +17592186044415.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw45.csv b/data/generated/generated_doubles_bw45.csv new file mode 100644 index 0000000..ed45668 --- /dev/null +++ b/data/generated/generated_doubles_bw45.csv @@ -0,0 +1,1024 @@ +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 
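For reference, a minimal sketch of how these generated_doubles_bw*.csv fixtures could be reproduced: each hunk above adds 1,024 lines, i.e. 1,023 copies of 2^bw - 1 rendered with one digit after the decimal point, then a single bare 0 with no newline at end of file. The generator below is an editorial illustration under that assumption, not part of the patch; its function name and formatting choices are assumptions.

// Sketch (assumed layout): 1023 repeats of (2^bw - 1) as a double, then a final 0.
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <string>

static void write_generated_doubles(uint32_t bw, const std::string& path) {
	// 2^bw - 1 is exactly representable as a double for bw <= 53.
	const double value = static_cast<double>((uint64_t{1} << bw) - 1); // bw=42 -> 4398046511103.0
	std::ofstream out(path);
	out << std::fixed << std::setprecision(1); // one digit after the decimal point, as in the files above
	for (int i = 0; i < 1023; ++i) {
		out << value << '\n';
	}
	out << 0; // last line is a bare 0; no newline at end of file
}

int main() {
	for (uint32_t bw = 42; bw <= 45; ++bw) {
		write_generated_doubles(bw, "generated_doubles_bw" + std::to_string(bw) + ".csv");
	}
	return 0;
}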
diff --git a/data/generated/generated_doubles_bw45.csv b/data/generated/generated_doubles_bw45.csv
new file mode 100644
index 0000000..ed45668
--- /dev/null
+++ b/data/generated/generated_doubles_bw45.csv
@@ -0,0 +1,1024 @@
+35184372088831.0
[... further identical lines of +35184372088831.0 (2^45 - 1) elided ...]
+35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 
+35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 
+35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +35184372088831.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw46.csv b/data/generated/generated_doubles_bw46.csv new file mode 100644 index 0000000..83098d8 --- /dev/null +++ b/data/generated/generated_doubles_bw46.csv @@ -0,0 +1,1024 @@ +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 
+70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 
+70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 
+70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 
+70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 
+70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +70368744177663.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw47.csv b/data/generated/generated_doubles_bw47.csv new file mode 100644 index 0000000..2fca9c6 --- /dev/null +++ b/data/generated/generated_doubles_bw47.csv @@ -0,0 +1,1024 @@ +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 
+140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 
+140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 
+140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 
+140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 
+140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +140737488355327.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw48.csv b/data/generated/generated_doubles_bw48.csv new file mode 100644 index 0000000..3d68929 --- /dev/null +++ b/data/generated/generated_doubles_bw48.csv @@ -0,0 +1,1024 @@ +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 +281474976710655.0 
+281474976710655.0
[... value 281474976710655.0 (2^48 - 1) repeated on the remaining data lines of the 1024-line hunk ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw49.csv b/data/generated/generated_doubles_bw49.csv
new file mode 100644
index 0000000..a7f1f48
--- /dev/null
+++ b/data/generated/generated_doubles_bw49.csv
@@ -0,0 +1,1024 @@
+562949953421311.0
[... value 562949953421311.0 (2^49 - 1) repeated for 1023 lines ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw5.csv b/data/generated/generated_doubles_bw5.csv
new file mode 100644
index 0000000..e951b24
--- /dev/null
+++ b/data/generated/generated_doubles_bw5.csv
@@ -0,0 +1,1024 @@
+31.0
[... value 31.0 (2^5 - 1) repeated for 1023 lines ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw50.csv b/data/generated/generated_doubles_bw50.csv
new file mode 100644
index 0000000..4bdc98e
--- /dev/null
+++ b/data/generated/generated_doubles_bw50.csv
@@ -0,0 +1,1024 @@
+1125899906842623.0
[... value 1125899906842623.0 (2^50 - 1) repeated for 1023 lines ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw51.csv b/data/generated/generated_doubles_bw51.csv
new file mode 100644
index 0000000..5b2d4b5
--- /dev/null
+++ b/data/generated/generated_doubles_bw51.csv
@@ -0,0 +1,1024 @@
+2251799813685247.0
[... value 2251799813685247.0 (2^51 - 1) repeated; run continues below ...]
+2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 
+2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 
+2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 
+2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +2251799813685247.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw52.csv b/data/generated/generated_doubles_bw52.csv new file mode 100644 index 0000000..7dc9ff9 --- /dev/null +++ b/data/generated/generated_doubles_bw52.csv @@ -0,0 +1,1024 @@ +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 
+4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 
+4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 
+4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 
+4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 
+4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 
+4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +4503599627370495.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw53.csv b/data/generated/generated_doubles_bw53.csv new file mode 100644 index 0000000..858c54c --- /dev/null +++ b/data/generated/generated_doubles_bw53.csv @@ -0,0 +1,1024 @@ +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 
+9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 
+9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 
+9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 +9007199254740991.0 
+9007199254740991.0
[... +9007199254740991.0 repeated on the remaining value lines ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw54.csv b/data/generated/generated_doubles_bw54.csv
new file mode 100644
index 0000000..9befaff
--- /dev/null
+++ b/data/generated/generated_doubles_bw54.csv
@@ -0,0 +1,1024 @@
+18014398509481983.0
[... +18014398509481983.0 repeated: 1,023 identical value lines in total ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw55.csv b/data/generated/generated_doubles_bw55.csv
new file mode 100644
index 0000000..d48b6bf
--- /dev/null
+++ b/data/generated/generated_doubles_bw55.csv
@@ -0,0 +1,1024 @@
+36028797018963967.0
[... +36028797018963967.0 repeated: 1,023 identical value lines in total ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw56.csv b/data/generated/generated_doubles_bw56.csv
new file mode 100644
index 0000000..d8d9429
--- /dev/null
+++ b/data/generated/generated_doubles_bw56.csv
@@ -0,0 +1,1024 @@
+72057594037927935.0
[... +72057594037927935.0 repeated: 1,023 identical value lines in total ...]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw57.csv b/data/generated/generated_doubles_bw57.csv
new file mode 100644
index 0000000..6986e14
--- /dev/null
+++ b/data/generated/generated_doubles_bw57.csv
@@ -0,0 +1,1024 @@
+144115188075855871.0
[... +144115188075855871.0 repeated ...]
+144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 
+144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 
+144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 
+144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 
+144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 
+144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +144115188075855871.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw58.csv b/data/generated/generated_doubles_bw58.csv new file mode 100644 index 0000000..af89c56 --- /dev/null +++ 
b/data/generated/generated_doubles_bw58.csv @@ -0,0 +1,1024 @@ +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 
+288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 
+288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 
+288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 
+288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 
+288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 
+288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +288230376151711743.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw59.csv b/data/generated/generated_doubles_bw59.csv new file mode 100644 index 0000000..3d6a306 --- /dev/null +++ b/data/generated/generated_doubles_bw59.csv @@ -0,0 +1,1024 @@ +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 +576460752303423487.0 
+576460752303423487.0
[condensed: "+576460752303423487.0" (2^59 - 1) repeated once per line for the rest of this hunk]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw6.csv b/data/generated/generated_doubles_bw6.csv
new file mode 100644
index 0000000..b620eee
--- /dev/null
+++ b/data/generated/generated_doubles_bw6.csv
@@ -0,0 +1,1024 @@
+63.0
[condensed: "+63.0" (2^6 - 1) repeated on the 1,023 value lines of this 1,024-line hunk]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw60.csv b/data/generated/generated_doubles_bw60.csv
new file mode 100644
index 0000000..7d592c6
--- /dev/null
+++ b/data/generated/generated_doubles_bw60.csv
@@ -0,0 +1,1024 @@
+1152921504606846975.0
[condensed: "+1152921504606846975.0" (2^60 - 1) repeated on the 1,023 value lines of this 1,024-line hunk]
+0
\ No newline at end of file
diff --git a/data/generated/generated_doubles_bw61.csv b/data/generated/generated_doubles_bw61.csv
new file mode 100644
index 0000000..21b8dad
--- /dev/null
+++ b/data/generated/generated_doubles_bw61.csv
@@ -0,0 +1,1024 @@
+2305843009213693951.0
[condensed: "+2305843009213693951.0" (2^61 - 1) repeated for the remaining added lines of this 1,024-line hunk]
+2305843009213693951.0 +2305843009213693951.0 +2305843009213693951.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw62.csv b/data/generated/generated_doubles_bw62.csv new file mode 100644 index 0000000..a69eb20 --- /dev/null +++ b/data/generated/generated_doubles_bw62.csv @@ -0,0 +1,1024 @@ +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 
+4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 
+4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 
+4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 
+4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 
+4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 
+4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +4611686018427387903.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw63.csv b/data/generated/generated_doubles_bw63.csv new file mode 100644 index 0000000..b8d02ad --- /dev/null +++ b/data/generated/generated_doubles_bw63.csv @@ -0,0 +1,1024 @@ +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 
+9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 
+9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 
+9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 
+9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 
+9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 
+9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 
+9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +9223372036854775807.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw64.csv b/data/generated/generated_doubles_bw64.csv new file mode 100644 index 0000000..538f918 --- /dev/null +++ b/data/generated/generated_doubles_bw64.csv @@ -0,0 +1,1024 @@ +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 
+18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 
+18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 
+18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 
+18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 
+18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 
+18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 
+18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +18446744073709551615.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw7.csv b/data/generated/generated_doubles_bw7.csv new file mode 100644 index 0000000..7ae50cb --- /dev/null +++ b/data/generated/generated_doubles_bw7.csv @@ -0,0 +1,1024 @@ +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 
+127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 
+127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +127.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw8.csv b/data/generated/generated_doubles_bw8.csv new file mode 100644 index 0000000..00310d0 --- /dev/null +++ b/data/generated/generated_doubles_bw8.csv @@ -0,0 +1,1024 @@ +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 
+255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 
+255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +255.0 +0 \ No newline at end of file diff --git a/data/generated/generated_doubles_bw9.csv b/data/generated/generated_doubles_bw9.csv new file mode 100644 index 0000000..24cac34 --- /dev/null +++ b/data/generated/generated_doubles_bw9.csv @@ -0,0 +1,1024 @@ +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 
+511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 
+511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +511.0 +0 \ No newline at end of file diff --git a/data/include/column.hpp b/data/include/column.hpp new file mode 100644 index 0000000..891705d --- /dev/null +++ b/data/include/column.hpp @@ -0,0 +1,38 @@ +#ifndef COLUMN_HPP +#define COLUMN_HPP + +#include +#include +#include + +namespace alp_bench { +struct Column { + uint64_t id; + std::string name; + const std::string sample_csv_file_path; + const std::string binary_file_path; + uint8_t factor {0}; + uint16_t exponent {0}; + uint16_t exceptions_count {0}; + uint8_t bit_width {0}; + bool suitable_for_cutting {false}; +}; + +struct paths { + std::string GENERATED_COLUMNS_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/generated/"; + std::string ALP_DATASET_SAMPLE_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/samples/"; + std::string EDGE_DATASET_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/edge_case/"; + std::string RESULT_DIR_PATH = std::string {CMAKE_SOURCE_DIR} + "/publication/"; + std::string ALP_DATASET_BINARY_DIR_PATH = " "; + + explicit paths() { + auto v = std::getenv("ALP_DATASET_DIR_PATH"); 
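+		// std::getenv returns nullptr when ALP_DATASET_DIR_PATH is unset; in that
+		// case ALP_DATASET_BINARY_DIR_PATH keeps the placeholder value above.
+		// (Descriptive comment; behavior as implemented on the next line.)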
+		if (v) { ALP_DATASET_BINARY_DIR_PATH = v; }
+	}
+};
+
+inline paths PATHS;
+
+} // namespace alp_bench
+
+#endif
\ No newline at end of file
diff --git a/data/include/data.hpp b/data/include/data.hpp
new file mode 100644
index 0000000..f7bccdd
--- /dev/null
+++ b/data/include/data.hpp
@@ -0,0 +1,10 @@
+#ifndef DATA_HPP
+#define DATA_HPP
+
+#include "column.hpp"
+#include "double_columns.hpp"
+#include "edge_case.hpp"
+#include "float_columns.hpp"
+#include "generated_columns.hpp"
+
+#endif // DATA_HPP
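A brief usage note on the paths struct above: the binary-dataset directory is resolved only from the environment, so benchmarks reading the *.bin files listed below need ALP_DATASET_DIR_PATH exported (for example, pointing at a local copy of the ALP binary datasets; any concrete directory here is illustrative, not taken from the patch). When the variable is unset, ALP_DATASET_BINARY_DIR_PATH keeps its single-space placeholder and only the in-repo CSV paths built from CMAKE_SOURCE_DIR point at real files.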
diff --git a/data/include/double_columns.hpp b/data/include/double_columns.hpp
new file mode 100644
index 0000000..3ebfdbd
--- /dev/null
+++ b/data/include/double_columns.hpp
@@ -0,0 +1,286 @@
+#ifndef ALP_DOUBLE_COLUMNS_HPP
+#define ALP_DOUBLE_COLUMNS_HPP
+
+#include "column.hpp"
+
+namespace alp_bench {
+
+inline std::array<Column, 30> alp_dataset = {{
+
+	{1,
+	 "Air-Pressure",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "neon_air_pressure.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "neon_air_pressure.bin",
+	 14,
+	 9,
+	 3,
+	 16},
+
+	{2,
+	 "Arade/4",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "arade4.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "arade4.bin",
+	 14,
+	 10,
+	 8,
+	 24},
+
+	{3,
+	 "Basel-Temp",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "basel_temp_f.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "basel_temp_f.bin",
+	 14,
+	 7,
+	 47,
+	 28},
+
+	{4,
+	 "Basel-Wind",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "basel_wind_f.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "basel_wind_f.bin",
+	 14,
+	 7,
+	 9,
+	 29},
+
+	{5,
+	 "Bird-Mig",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "bird_migration_f.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "bird_migration_f.bin",
+	 14,
+	 9,
+	 2,
+	 17},
+
+	{6,
+	 "Btc-Price",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "bitcoin_f.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "bitcoin_f.bin",
+	 14,
+	 10,
+	 10,
+	 25},
+
+	{7,
+	 "Blockchain",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "bitcoin_transactions_f.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "bitcoin_transactions_f.bin",
+	 14,
+	 10,
+	 11,
+	 30},
+
+	{8,
+	 "City-Temp",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "city_temperature_f.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "city_temperature_f.bin",
+	 14,
+	 13,
+	 0,
+	 11},
+
+	{9,
+	 "CMS/1",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "cms1.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "cms1.bin",
+	 14,
+	 5,
+	 10,
+	 41},
+
+	{10,
+	 "CMS/9",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "cms9.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "cms9.bin",
+	 16,
+	 16,
+	 2,
+	 10},
+
+	{11,
+	 "CMS/25",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "cms25.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "cms25.bin",
+	 14,
+	 4,
+	 6,
+	 42},
+
+	{12,
+	 "Dew-Temp",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "neon_dew_point_temp.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "neon_dew_point_temp.bin",
+	 14,
+	 11,
+	 6,
+	 13},
+
+	{13,
+	 "Bio-Temp",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "neon_bio_temp_c.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "neon_bio_temp_c.bin",
+	 14,
+	 12,
+	 0,
+	 10},
+
+	{14,
+	 "Food-prices",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "food_prices.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "food_prices.bin",
+	 16,
+	 12,
+	 46,
+	 20},
+
+	{15,
+	 "Gov/10",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "gov10.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "gov10.bin",
+	 3,
+	 1,
+	 72,
+	 27},
+
+	{16,
+	 "Gov/26",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "gov26.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "gov26.bin",
+	 18,
+	 18,
+	 0,
+	 0},
+
+	{17,
+	 "Gov/30",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "gov30.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "gov30.bin",
+	 18,
+	 18,
+	 4,
+	 0},
+
+	{18,
+	 "Gov/31",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "gov31.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "gov31.bin",
+	 18,
+	 18,
+	 1,
+	 0},
+
+	{19,
+	 "Gov/40",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "gov40.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "gov40.bin",
+	 18,
+	 18,
+	 3,
+	 0},
+
+	{20,
+	 "Medicare/1",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "medicare1.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "medicare1.bin",
+	 14,
+	 5,
+	 37,
+	 38},
+
+	{21,
+	 "Medicare/9",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "medicare9.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "medicare9.bin",
+	 16,
+	 16,
+	 3,
+	 10},
+
+	{22,
+	 "PM10-dust",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "neon_pm10_dust.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "neon_pm10_dust.bin",
+	 14,
+	 11,
+	 0,
+	 8},
+
+	{23,
+	 "NYC/29",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "nyc29.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "nyc29.bin",
+	 14,
+	 1,
+	 5,
+	 42},
+
+	{24,
+	 "POI-lat",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "poi_lat.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "poi_lat.bin",
+	 16,
+	 0,
+	 157,
+	 55,
+	 true},
+
+	{25,
+	 "POI-lon",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "poi_lon.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "poi_lon.bin",
+	 16,
+	 0,
+	 199,
+	 56,
+	 true},
+
+	{26,
+	 "SD-bench",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "ssd_hdd_benchmarks_f.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "ssd_hdd_benchmarks_f.bin",
+	 14,
+	 13,
+	 0,
+	 17},
+
+	{27,
+	 "Stocks-DE",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "stocks_de.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "stocks_de.bin",
+	 14,
+	 11,
+	 5,
+	 10
+
+	},
+
+	{28,
+	 "Stocks-UK",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "stocks_uk.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "stocks_uk.bin",
+	 14,
+	 13,
+	 0,
+	 9},
+
+	{29,
+	 "Stocks-USA",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "stocks_usa_c.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "stocks_usa_c.bin",
+	 14,
+	 12,
+	 0,
+	 7},
+
+	{30,
+	 "Wind-dir",
+	 PATHS.ALP_DATASET_SAMPLE_CSV_PATH + "neon_wind_dir.csv",
+	 PATHS.ALP_DATASET_BINARY_DIR_PATH + "neon_wind_dir.bin",
+	 14,
+	 12,
+	 0,
+	 16},
+
+}};
+} // namespace alp_bench
+#endif
\ No newline at end of file
diff --git a/data/include/edge_case.hpp b/data/include/edge_case.hpp
new file mode 100644
index 0000000..d40c1be
--- /dev/null
+++ b/data/include/edge_case.hpp
@@ -0,0 +1,13 @@
+#ifndef ALP_EDGE_CASE_HPP
+#define ALP_EDGE_CASE_HPP
+
+#include "column.hpp"
+
+namespace alp_bench {
+inline std::array<Column, 1> edge_case = {{
+
+	{1, "edge_case", PATHS.EDGE_DATASET_CSV_PATH + "edge_case.csv", "", 0, 0, 12, 0, true},
+
+}};
+} // namespace alp_bench
+#endif
\ No newline at end of file
diff --git a/data/include/float_columns.hpp b/data/include/float_columns.hpp
new file mode 100644
index 0000000..5c3492f
--- /dev/null
+++ b/data/include/float_columns.hpp
@@ -0,0 +1,19 @@
+#ifndef ALP_FLOAT_COLUMNS_HPP
+#define ALP_FLOAT_COLUMNS_HPP
+
+#include "column.hpp"
+
+namespace alp_bench {
+inline std::array<Column, 4> sp_datasets = {{
+
+	{1, "Dino-Vitb16", "", PATHS.ALP_DATASET_BINARY_DIR_PATH + "sp_dino_vitb16.bin", 0, 0, 0, 0, true},
+
+	{2, "GPT2", "", PATHS.ALP_DATASET_BINARY_DIR_PATH + "sp_gpt2.bin", 0, 0, 0, 0, true},
+
+	{3, "Grammarly-lg", "", PATHS.ALP_DATASET_BINARY_DIR_PATH + "sp_grammarly_coedit_lg.bin", 0, 0, 0, 0, true},
+
+	{4, "WAV2VEC", "", PATHS.ALP_DATASET_BINARY_DIR_PATH + "sp_wav2vec2_base_960h.bin", 0, 0, 0, 0, true},
+
+}};
+} // namespace alp_bench
+#endif
\ No newline at end of file
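For orientation before the generated-column table that follows: each generated_doubles_bwN.csv added by this patch holds 1024 lines, a single repeated value (2^N - 1 for the files visible here, e.g. 127.0 for bw7, 511.0 for bw9, 18446744073709551615.0 for bw64) plus one trailing 0, so each file exercises exactly one bit width. A minimal sketch of how the column tables are typically consumed (assumptions: the data/include headers above are on the include path and CMAKE_SOURCE_DIR is passed as a compile definition, as column.hpp expects; the loop itself is illustrative and not part of the patch):

#include <iostream>

#include "data.hpp"

int main() {
	// Each Column row pairs a sample CSV with the ALP parameters expected for
	// it: factor, exponent, exceptions_count and bit_width.
	for (const alp_bench::Column& column : alp_bench::alp_dataset) {
		std::cout << column.name << ": bit_width=" << static_cast<int>(column.bit_width)
		          << " exceptions=" << column.exceptions_count
		          << " csv=" << column.sample_csv_file_path << '\n';
	}
	return 0;
}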
diff --git a/data/include/generated_columns.hpp b/data/include/generated_columns.hpp
new file mode 100644
index 0000000..40fd2ec
--- /dev/null
+++ b/data/include/generated_columns.hpp
@@ -0,0 +1,80 @@
+/*
+-- DATE : 19/04/2024
+-- FILE_PATH : data/include/generated_columns.hpp
+-- PROJECT_NAME : ALP
+*/
+
+#ifndef DATA_GENERATED_COLUMNS_HPP
+#define DATA_GENERATED_COLUMNS_HPP
+
+#include "column.hpp"
+
+namespace alp_bench {
+inline std::array<Column, 65> generated_cols = {
+    {{0, "bw0", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw0.csv", "", 0, 0, 0, 0},
+     {1, "bw1", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw1.csv", "", 0, 0, 0, 1},
+     {2, "bw2", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw2.csv", "", 0, 0, 0, 2},
+     {3, "bw3", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw3.csv", "", 0, 0, 0, 3},
+     {4, "bw4", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw4.csv", "", 0, 0, 0, 4},
+     {5, "bw5", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw5.csv", "", 0, 0, 0, 5},
+     {6, "bw6", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw6.csv", "", 0, 0, 0, 6},
+     {7, "bw7", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw7.csv", "", 0, 0, 0, 7},
+     {8, "bw8", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw8.csv", "", 0, 0, 0, 8},
+     {9, "bw9", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw9.csv", "", 0, 0, 0, 9},
+     {10, "bw10", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw10.csv", "", 0, 0, 0, 10},
+     {11, "bw11", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw11.csv", "", 0, 0, 0, 11},
+     {12, "bw12", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw12.csv", "", 0, 0, 0, 12},
+     {13, "bw13", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw13.csv", "", 0, 0, 0, 13},
+     {14, "bw14", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw14.csv", "", 0, 0, 0, 14},
+     {15, "bw15", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw15.csv", "", 0, 0, 0, 15},
+     {16, "bw16", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw16.csv", "", 0, 0, 0, 16},
+     {17, "bw17", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw17.csv", "", 0, 0, 0, 17},
+     {18, "bw18", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw18.csv", "", 0, 0, 0, 18},
+     {19, "bw19", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw19.csv", "", 0, 0, 0, 19},
+     {20, "bw20", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw20.csv", "", 0, 0, 0, 20},
+     {21, "bw21", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw21.csv", "", 0, 0, 0, 21},
+     {22, "bw22", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw22.csv", "", 0, 0, 0, 22},
+     {23, "bw23", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw23.csv", "", 0, 0, 0, 23},
+     {24, "bw24", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw24.csv", "", 0, 0, 0, 24},
+     {25, "bw25", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw25.csv", "", 0, 0, 0, 25},
+     {26, "bw26", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw26.csv", "", 0, 0, 0, 26},
+     {27, "bw27", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw27.csv", "", 0, 0, 0, 27},
+     {28, "bw28", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw28.csv", "", 0, 0, 0, 28},
+     {29, "bw29", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw29.csv", "", 0, 0, 0, 29},
+     {30, "bw30", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw30.csv", "", 0, 0, 0, 30},
+     {31, "bw31", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw31.csv", "", 0, 0, 0, 31},
+     {32, "bw32", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw32.csv", "",
0, 0, 0, 32}, + {33, "bw33", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw33.csv", "", 0, 0, 0, 33}, + {34, "bw34", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw34.csv", "", 0, 0, 0, 34}, + {35, "bw35", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw35.csv", "", 0, 0, 0, 35}, + {36, "bw36", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw36.csv", "", 0, 0, 0, 36}, + {37, "bw37", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw37.csv", "", 0, 0, 0, 37}, + {38, "bw38", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw38.csv", "", 0, 0, 0, 38}, + {39, "bw39", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw39.csv", "", 0, 0, 0, 39}, + {40, "bw40", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw40.csv", "", 0, 0, 0, 40}, + {41, "bw41", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw41.csv", "", 0, 0, 0, 41}, + {42, "bw42", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw42.csv", "", 0, 0, 0, 42}, + {43, "bw43", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw43.csv", "", 0, 0, 0, 43}, + {44, "bw44", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw44.csv", "", 0, 0, 0, 44}, + {45, "bw45", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw45.csv", "", 0, 0, 0, 45}, + {46, "bw46", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw46.csv", "", 0, 0, 0, 46}, + {47, "bw47", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw47.csv", "", 0, 0, 0, 47}, + {48, "bw48", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw48.csv", "", 0, 0, 0, 48}, + {49, "bw49", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw49.csv", "", 0, 0, 0, 49}, + {50, "bw50", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw50.csv", "", 0, 0, 0, 50}, + {51, "bw51", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw51.csv", "", 0, 0, 0, 51}, + {52, "bw52", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw52.csv", "", 0, 0, 0, 52}, + {53, "bw53", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw53.csv", "", 0, 0, 0, 53}, + {54, "bw54", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw54.csv", "", 0, 0, 0, 54}, + {55, "bw55", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw55.csv", "", 0, 0, 0, 55}, + {56, "bw56", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw56.csv", "", 0, 0, 0, 56}, + {57, "bw57", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw57.csv", "", 0, 0, 0, 57}, + {58, "bw58", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw58.csv", "", 0, 0, 0, 58}, + {59, "bw59", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw59.csv", "", 0, 0, 0, 59}, + {60, "bw60", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw60.csv", "", 0, 0, 0, 60}, + {61, "bw61", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw61.csv", "", 0, 0, 0, 61}, + {62, "bw62", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw62.csv", "", 0, 0, 0, 62}, + {63, "bw63", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw63.csv", "", 0, 0, 0, 63}, + {64, "bw64", PATHS.GENERATED_COLUMNS_CSV_PATH + "generated_doubles_bw64.csv", "", 0, 0, 0, 64}}}; +} +#endif // DATA_GENERATED_COLUMNS_HPP diff --git a/data/samples/air_sensor_f.csv b/data/samples/air_sensor_f.csv new file mode 100644 index 0000000..06f0302 --- /dev/null +++ b/data/samples/air_sensor_f.csv @@ -0,0 +1,1024 @@ +0.760596223549682 +0.766602358530133 +0.7640825065497033 +0.78199319534235 +0.7928825265604798 +0.8122623298163423 +0.8094212452911653 +0.8008702297987237 
+0.7831979616620974 +0.7843400666085314 +0.7835688730309645 +0.7726603787062662 +0.7802838243162683 +0.7896994471683727 +0.8030315978849254 +0.8045825619982389 +0.7991935210671951 +0.79924502331348 +0.8159758383555016 +0.7967972189307905 +0.8126594801508729 +0.8216302008892404 +0.8351956570416047 +0.843730522009409 +0.8303822633046659 +0.8221109517379086 +0.8420872815332069 +0.8248203992082053 +0.842940315059234 +0.8351933833105395 +0.8398798142335242 +0.8515877081100611 +0.8476226653706567 +0.8416666338087044 +0.8223478343843011 +0.8286867407907272 +0.8301496219481592 +0.8169189631234933 +0.8105421753155602 +0.801792753080239 +0.8043345402179564 +0.8124926201586882 +0.8038638392106682 +0.7856344190084167 +0.804441592414797 +0.795488943145632 +0.8033919851086053 +0.8062591536449222 +0.8198153920125417 +0.8098442935326289 +0.8204512766548021 +0.8145428002785758 +0.8071815097966621 +0.8035493782035813 +0.8096074377114235 +0.8156188182261642 +0.83161977941169 +0.8248902819659495 +0.8414473984195479 +0.8556391951093216 +0.8446139051109504 +0.8408265672055486 +0.8339480320379975 +0.821003418916803 +0.8026396278420062 +0.8007178957816717 +0.8103347668782861 +0.8050794380413179 +0.8230590868276375 +0.8123855177873699 +0.8285216124498246 +0.8203525583263509 +0.8049220974083486 +0.8060098485783298 +0.7903185994027981 +0.8067276705496883 +0.8207919559951856 +0.8299863708453672 +0.8161568017464413 +0.7962671468500488 +0.8153374538502538 +0.8101521278342152 +0.8248092978736342 +0.8404310566647136 +0.8229725353102373 +0.8360148510877041 +0.8490740379593104 +0.8469512534018492 +0.8540280526243286 +0.8471499605655102 +0.8584746412166915 +0.846969076391422 +0.8588156493043172 +0.8661369863108049 +0.874995314138454 +0.8871942462934327 +0.8713375300009402 +0.8622404415596673 +0.8773588947710955 +0.8588682302247906 +0.872220391584288 +0.8835413257909437 +0.880021282209051 +0.8604405694980298 +0.8408696886040931 +0.8469233273711261 +0.8523097502769009 +0.855734391751872 +0.856566563699794 +0.8625633649937857 +0.8680916739546118 +0.8723054130749694 +0.8833807322553121 +0.8866971360659431 +0.9008783266759764 +0.8962262934051264 +0.880595421674202 +0.8725080144338396 +0.4999938546448889 +0.5024894210450148 +0.520412174253362 +0.5129238614640343 +0.5187822661530462 +0.5157030286983335 +0.5302581407445767 +0.534545680626847 +0.5372354097223764 +0.5200592250925515 +0.5189068694585002 +0.533181581876604 +0.5242409082096071 +0.5289407539118929 +0.5179901768182025 +0.531990096969868 +0.5285045183080563 +0.5247715009916786 +0.5323769898068057 +0.5200127393968633 +0.5197611097885337 +0.5296897520674242 +0.5488625172415836 +0.5612388643830566 +0.5669507090854866 +0.5705741304364356 +0.5669057518556646 +0.5609618347061459 +0.5631064900907923 +0.5598386946845729 +0.5557494236688348 +0.5451149056234545 +0.555764175097852 +0.5447580830743364 +0.5518644281414663 +0.5542123917671868 +0.5671841442673093 +0.5798169393372595 +0.5869439514994824 +0.5914847719560744 +0.6104290798331619 +0.6096456170918295 +0.6128254639824731 +0.6112856233541611 +0.5938481291858906 +0.5995078557985651 +0.5852442113684319 +0.5959593108304754 +0.5934513212080343 +0.6067899226364287 +0.6118938541169446 +0.6028591539570175 +0.58842626809168 +0.58156338298191 +0.5739662450029811 +0.5786321109997684 +0.559495291072754 +0.5615775815942274 +0.5486527585684695 +0.5601427275190617 +0.5467977131440417 +0.5427414768682487 +0.5259067992779505 +0.5070405819846109 +0.5047252012697138 +0.4944311745042022 +0.5100518254544081 +0.5252567065353136 +0.5417920879897526 
+0.5601574953203691 +0.5792750391507774 +0.576530656770128 +0.5922806008905744 +0.575314211926126 +0.5915390357504278 +0.6027266527808627 +0.5829431262169079 +0.584901955006808 +0.5766742877802291 +0.5817001164089748 +0.5978214671744458 +0.5973555773203488 +0.5984436927757544 +0.5956120995509063 +0.5763744405271791 +0.5905462510470734 +0.6018626719844098 +0.5998852097703411 +0.6046903405367449 +0.6064156567227686 +0.6008451613194512 +0.6060054000746353 +0.6051073168594459 +0.6076740871618004 +0.5996681216327988 +0.6114037164804103 +0.6288145761962023 +0.6364510027200799 +0.6265814677114494 +0.6104573577046071 +0.6135293779763525 +0.6239356362903405 +0.6066696333851135 +0.6050253089361528 +0.6024809075914602 +0.6169325551267038 +0.6017486898872856 +0.5988546019819667 +0.5978556927632056 +0.5823574916623097 +0.5860992210305832 +0.5781210412339328 +0.5612703510204682 +0.5579521301161132 +0.5585014127813664 +0.5582566546232407 +0.5440782354100189 +0.5564770776312351 +0.5636976752502926 +0.5692667505962772 +0.5670287388173271 +0.5699163196432054 +0.5563346590253221 +0.5530296505234741 +0.5509271630094845 +0.5537262377498807 +0.5359280262460006 +0.5315816567504704 +0.5510030038952807 +0.5356541555616157 +0.5429804228612938 +0.5370855061985225 +0.5483489394273516 +0.5558765588477678 +0.5421125444729837 +0.5608958316501264 +0.5781524734283543 +0.5869507465383251 +0.5868335332063515 +0.5674760829375383 +0.5858038066491115 +0.6001203686936315 +0.6086749464717224 +0.6231015977141547 +0.6187501021000271 +0.6101929437346216 +0.5960315711090605 +0.5786102149112156 +0.5630007385745243 +0.5536504965644768 +0.5497409748551667 +0.5487750444089777 +0.5468681449571123 +0.5295212214681702 +0.5162075949843263 +0.5102823459354259 +0.5127145835449038 +0.5289956070649382 +0.5189687895880203 +0.5175218410727076 +0.5372880658582462 +0.5265059140336633 +0.5161143987191487 +0.50724416459567 +0.5224370033317989 +0.5105828695662274 +0.5292493798587238 +0.5441741434700273 +0.5333158845537905 +0.5224758896949383 +0.5140423266615556 +0.5173643027019377 +0.5253943226262366 +0.5382900945824382 +0.536963383703713 +0.5246300425782433 +0.5061635870036919 +0.5054812245802804 +0.5241446760873661 +0.5299240068566151 +0.517355146104284 +0.5371675228367506 +0.5456265055728163 +0.5591774084721199 +0.5553387665735771 +0.5547486745235248 +0.554983776806673 +0.5517507013306903 +0.5455092878306851 +0.5300369722560901 +0.5178108363647566 +0.5073000788352001 +0.5264123443482616 +0.5449548766341886 +0.542867370623469 +0.5392024646271171 +0.5350146031028247 +0.5488533645901572 +0.5316736414816878 +0.5298892188738946 +0.5147772999274532 +0.5059662484584041 +0.49121273329201925 +0.48179838418604826 +0.4750380067379492 +0.4642300367974236 +0.45918864955542044 +0.45041588541469757 +0.4482363889978767 +0.4657563307129271 +0.4795729282807186 +0.4807652716293525 +0.4812218205952023 +0.491448980175473 +0.4928781030273596 +0.5071133049013895 +0.5214295934505001 +0.5238944114784613 +0.538402751968646 +0.5294025262265273 +0.5311291619827294 +0.529638148729965 +0.54001511878127 +0.5574468891961026 +0.5702487192191935 +0.5710955092280392 +0.5761003534419021 +0.5891016561819917 +0.6014329525728936 +0.614061709417923 +0.607237488684452 +0.6176725922415656 +0.6088664233970136 +0.6121833174944766 +0.6193403326786847 +0.5999739393367217 +0.6188205312977294 +0.5989618200142361 +0.6074880724536522 +0.5975094468631901 +0.5964545592719915 +0.5882981360995148 +0.5865466287137968 +0.5943233457363003 +0.584109028646711 +0.5828087208114313 +0.5938493401872643 
+0.6056672415637558 +0.5942627369081289 +0.6117316720948686 +0.6216537965474164 +0.632854797469186 +0.6381106324504487 +0.6325993654647158 +0.6515910722118338 +0.6548495558666256 +0.6506436641905495 +0.653093091920997 +0.6431872087330681 +0.6565587818790998 +0.6406358822486655 +0.6537889089373272 +0.6429334640809158 +0.6231122890225229 +0.6298584483225868 +0.6230853245475706 +0.6350668557414615 +0.6152917721323171 +0.6289650302670283 +0.6405975643478767 +0.6488415030991111 +0.637233463288462 +0.6240093772267868 +0.6097761654003336 +0.6279424413167168 +0.6113062551962712 +0.6300934318143754 +0.637013950509673 +0.6215891978687739 +0.6308703311007892 +0.632116179906666 +0.634261095196567 +0.6289085212375597 +0.6167178320873917 +0.6058750635728702 +0.6087637233315291 +0.6154826783521755 +0.6276616453637952 +0.6304707098931327 +0.644113861070728 +0.6272829515386531 +0.638315567495626 +0.6527999403992061 +0.6414615321857124 +0.6215101435775435 +0.6397631178318492 +0.6217210050464058 +0.6281151448201957 +0.6305290318731858 +0.6298648956329442 +0.6395519244602473 +0.6425708541152199 +0.6417589060182287 +0.6394957737160353 +0.6261078006943126 +0.6130631526494271 +0.6025030520854213 +0.5940305627133907 +0.5807020425044082 +0.5922476419773677 +0.5731863082497765 +0.5540977848681896 +0.5690029032894616 +0.5741910818129956 +0.55815450141679 +0.5484567149685075 +0.5599902465066164 +0.5746471934846527 +0.5774617816111662 +0.5869664375318111 +0.5847754796273307 +0.585268661764082 +0.5683580977653964 +0.5781673705903584 +0.5913460508546102 +0.5778598057297697 +0.5947238587275028 +0.5852125907586583 +0.566427853696549 +0.5724471856622371 +0.573804348176559 +0.586408184678761 +0.6028651018527199 +0.6039284613089615 +0.6067275500189644 +0.6074059916497329 +0.6158946643580976 +0.6215307242227862 +0.6354932498641225 +0.6476957053923329 +0.6466720492388488 +0.6314884839482291 +0.6439952941614122 +0.6383914499785526 +0.6189221086934591 +0.6329981882658218 +0.6473184506613167 +0.6472125828652713 +0.6440526928835847 +0.6372872680685823 +0.6217545645291659 +0.6127855449592131 +0.6278968501143749 +0.6244726162466038 +0.6172548469402267 +0.6203431246591845 +0.6236202949887267 +0.6108284485245288 +0.6211004135549446 +0.622477211547482 +0.6109366234679457 +0.41063537221505036 +0.3974961147288166 +0.412858019255717 +0.4292192374071801 +0.424578663884567 +0.44036944947373724 +0.45981607618804154 +0.4770526773246108 +0.49241676254524264 +0.4995576108726206 +0.4875941566886386 +0.4977831599383976 +0.4943815619346647 +0.4979526934534321 +0.5129695386899314 +0.4964658171934647 +0.5094103671778297 +0.512784265601149 +0.506603462259903 +0.5182746604905732 +0.516753163838992 +0.5099761641253293 +0.5000404125517424 +0.4822135573034061 +0.4815620820765379 +0.48587598559035206 +0.5051225686248995 +0.5033774405166588 +0.5163296158620064 +0.5087020144447141 +0.522443236140487 +0.5176932155722934 +0.5054973701966863 +0.5090415193097593 +0.5238908382555592 +0.5073361981593785 +0.5227770928397736 +0.5106537456134632 +0.5034569914816115 +0.5207561221314796 +0.5259456776489397 +0.5340646024415018 +0.5401479365974775 +0.5315942508360896 +0.5398754564665398 +0.5265392895166504 +0.520484876710902 +0.5312337647177308 +0.5434928513237082 +0.5551019344368918 +0.5623742959038002 +0.5619204965668605 +0.5619561050917751 +0.5497600418266699 +0.5594407829446475 +0.5448202030795466 +0.558635277616349 +0.5464362583921367 +0.5537976050282273 +0.5603612118985605 +0.570456571937823 +0.5808565587536978 +0.5858602697367655 +0.6007063926729674 
+0.6158924529436132 +0.6287413567198464 +0.620724510067743 +0.6131815122692309 +0.6170431457800171 +0.6168859794695166 +0.6244180364671449 +0.6065404099722708 +0.6126369381552462 +0.6224571822003393 +0.6359961069413606 +0.633155786561137 +0.6260432197435613 +0.6450884340331721 +0.6415819011580778 +0.6397327569546447 +0.6570916506198422 +0.6432570060867397 +0.6428979624969106 +0.6369255254944255 +0.647155555849083 +0.6290981558659505 +0.640758076294878 +0.6393347513502563 +0.6401593452961637 +0.655108286299179 +0.6551066760842723 +0.6526466016623658 +0.6422060480907932 +0.6556827631718534 +0.6636197616376338 +0.6801095267128718 +0.6733805730070149 +0.6629880794273978 +0.6664867702984275 +0.6621658535037741 +0.6807622497570947 +0.674043005041753 +0.6664451153230356 +0.6563852326545953 +0.6682842394048727 +0.6868432130383539 +0.6812021619208035 +0.6945513683418638 +0.7003688988397977 +0.7046559022880513 +0.6953799876679034 +0.7026411449070213 +0.7144031374767156 +0.719885836966855 +0.7096622438584342 +0.7051538532202959 +0.7107424144967228 +0.7140542529116742 +0.7004720017393127 +0.7125899505242851 +0.7018215662526975 +0.6854835829815773 +0.6660158125227015 +0.6506232312034108 +0.6493064287638302 +0.6317837003812732 +0.650032730841602 +0.6622335290628566 +0.645826372002037 +0.6275484051332199 +0.6421296646447987 +0.628481376558419 +0.6276862393149056 +0.6456383025498655 +0.6411049469861282 +0.6267508421415698 +0.6147740281509916 +0.6238878266689776 +0.603924743615559 +0.6193967965118538 +0.6084069665066889 +0.6177697762186294 +0.6215001036943928 +0.637092326204217 +0.6270506231870095 +0.6439216093709048 +0.6245498341203678 +0.6144955757085485 +0.6049740532702219 +0.5960164509969065 +0.6002912719231542 +0.6144216213169464 +0.6065265613410061 +0.5936837303368314 +0.5950951750703545 +0.5861349030988042 +0.5959270833555053 +0.6059711511131858 +0.5996371165553891 +0.6033935979369253 +0.6091577561334655 +0.6170370839969327 +0.6182683127187335 +0.6263444633779094 +0.6108375070420323 +0.6051189326619026 +0.6166871401258954 +0.6029446015038844 +0.6034765034094857 +0.6159198843927614 +0.6345646026377558 +0.6392190093305067 +0.6226391083604829 +0.6355170844659962 +0.6321450004892045 +0.61410613722032 +0.5966140704726062 +0.5881805197640143 +0.5790427939811162 +0.5816622994427973 +0.5638987286729223 +0.5731684076421699 +0.5677929677431056 +0.5502909966291122 +0.5347116554117296 +0.5264185956398336 +0.5204009644162955 +0.5130762990221076 +0.4968273056140349 +0.5025612094439154 +0.5170877014975039 +0.49782514175224446 +0.5164234631058039 +0.4972162396936373 +0.47738543255850174 +0.4916996818880089 +0.5089903869022748 +0.5131433310505464 +0.49609114313617186 +0.49938769109221187 +0.5186802240699877 +0.5294417664096818 +0.5148459072442498 +0.5238483549576363 +0.507522338288661 +0.5130497578496823 +0.5248724424257851 +0.512569979445355 +0.5319133535188726 +0.5246554142128873 +0.5426801217781835 +0.556931382935315 +0.5413507811810124 +0.5464682314990276 +0.5267734416056034 +0.5219401248964328 +0.537402137187969 +0.5447097885114731 +0.5560427149331475 +0.5600546378210917 +0.5720968638381769 +0.5545445898936116 +0.554482423780575 +0.5565460423155311 +0.558356093482319 +0.5473464369341711 +0.5614967197002002 +0.557376652683151 +0.5750510868709608 +0.556282841035489 +0.5669124304729682 +0.5822887275109961 +0.6002508275000514 +0.5853766119079986 +0.5850046740995726 +0.5706197300878417 +0.5803048593684877 +0.5691429639498787 +0.5857905294976402 +0.5784073444867271 +0.5621683133712366 +0.5549417500765874 
+0.5673749881349949 +0.5642875453246786 +0.5755169970321009 +0.5624622313333615 +0.5752474137104698 +0.5903331559907721 +0.5960632826389751 +0.5929774590613273 +0.591206360434652 +0.5922793192396529 +0.5773973695643732 +0.5606871428511194 +0.5785654862723432 +0.594816784570594 +0.5771082760642137 +0.5752621852186158 +0.5904266720073594 +0.5832050993370799 +0.5801780238618989 +0.5631980430657483 +0.5496416788276037 +0.5561426352728863 +0.552272094591304 +0.55409890796759 +0.5480198902995336 +0.5528845197234473 +0.5498370319079333 +0.5333403354235874 +0.5336529494649598 +0.5142348700294709 +0.51919388581257 +0.5020443427712853 +0.512644473742938 +0.5134540855763073 +0.5170759220830976 +0.5002582082474484 +0.49855277590086294 +0.5093661544380926 +0.5082320864408395 +0.5228303626961759 +0.516220115957779 +0.5094911145026811 +0.5039221843176488 +0.5047772510784069 +0.4856378706241141 +0.5020919947565967 +0.518671865845135 +0.504436799438619 +0.5201701268273307 +0.5358021250090625 +0.5501243774265229 +0.5444438733852668 +0.5516589525102129 +0.5431996019812985 +0.5298732186853283 +0.5262231627057403 +0.521160393216806 +0.5106574676134621 +0.5232595808655953 +0.5137101173588339 +0.5123615867372627 +0.5313024872271271 +0.529950558695378 +0.5325276559444082 +0.5256929955020725 +0.5386267599293195 +0.5284870758617988 +0.5400574012259255 +0.5408994775593882 +0.5462742222569752 +0.5297388701436668 +0.5260513018217388 +0.5247759347356816 +0.5438624958001997 +0.5599646049589646 +0.5429271650231593 +0.5411874559174923 +0.5495955291998658 +0.5368988561759146 +0.5516872313340746 +0.5359566213723015 +0.524953191797476 +0.5402934732868587 +0.5460013797131191 +0.531028174539856 +0.5279110329873022 +0.5454726097033409 +0.5386305644064712 +0.5310036733384597 +0.5281535785880698 +0.5436781562471453 +0.5398096396850597 +0.5368107519584234 +0.5350479132694101 +0.5491207979981606 +0.5394165838463886 +0.5566894257312631 +0.5749506017893116 +0.5672320470200398 +0.5688732350251141 +0.584532148360923 +0.5875226581596349 +0.5942245554574879 +0.5864858841600062 +0.5945399501316515 +0.5747329850493538 +0.5705322531976976 +0.5783138855285924 +0.5690146724761458 +0.5639106583738992 +0.5798217062599719 +0.57542227506842 +0.5877150695994812 +0.5968559490393182 +0.5853987415619917 +0.5895943058855329 +0.5922090476060551 +0.6035302264763103 +0.6047370366797206 +35.06998718479175 +35.026510405496204 +35.052194109699144 +35.02678091350467 +34.97817093704823 +34.95358447933235 +34.94734853877246 +34.99424815718199 +35.00118424635222 +34.98014339492122 +35.0139163105153 +35.042603866808626 +35.05672731198009 +35.04602955124461 +35.05141510117176 +35.09631745104982 +35.11995667661152 +35.13436888764459 +35.143486141656496 +35.14944467051308 +35.128403024884946 +35.086292193235764 +35.10745692589911 +35.11432938089309 +35.0724472880697 +35.08937850327463 +35.0820301360706 +35.122243573572455 +35.14108976705901 +35.09372066337333 +35.087338207017346 +35.04438922764075 +35.02274704351056 +35.010260878377935 +34.99544246979852 +34.96368424430957 +34.959783529148936 +34.92913914222301 +34.95884585399919 +34.961669705713916 +34.968613723565895 +35.012329342196594 +34.992882684803746 +34.95000814962301 +34.93262584092513 +34.93576058375312 +34.90374320812031 +34.93897328578008 +34.95984550591673 +34.96161085458785 +34.91808022365145 +34.914458415968696 +34.93247169131533 +34.950378208955826 +34.992823685929324 +35.03547201030871 +35.01840057665719 +34.98952002427887 +34.994169130008835 +34.9518472710744 +34.95984526005336 
+34.936790359609944 +34.95698789767991 +34.94695045939719 +34.92618848344812 +34.920976665853495 +34.89244942145761 +34.90890046794444 +34.912851269681354 +34.866995804327104 +34.876731484989584 +34.87156455294957 +34.852699335779434 +34.8168487844343 +34.81059341929376 +34.79727464354893 +34.772673879711434 +34.72367709799322 +34.764228402443464 +34.748290948061985 +34.77529855291847 +34.80384256266734 +34.760541296946634 +34.75437598850247 +34.72486600890642 +34.70337856375387 +34.71408280395347 +34.68629143405548 +34.64670072725053 +34.659458665380114 +34.69585183894684 +34.703188882022715 +34.72395709250132 +34.72857460728405 +34.69782580561042 +34.74152536579461 +34.71082160226905 +34.726955828445384 +34.683580576556515 +34.71441126025169 +34.71800930886349 +34.73930463371133 +34.6914963755514 +34.687611102317256 +34.65769373870956 +34.64841617692879 +34.66854187766009 +34.66413120073309 +34.67231394861078 +34.67143841823385 +34.63814152527399 +34.60159271182473 +34.558574157570895 +34.52947169990303 +34.5190296272953 +34.56822139118722 +34.55186888514822 +34.56016153442625 +34.52765657178888 +34.54731472608095 +34.550995568068466 +34.54594399886894 +34.57188183481607 +34.523684281588594 +34.523778940147814 +34.51034537244077 +34.558143538707014 +34.56780158013292 +34.55123138314778 +34.554485320608066 +34.57491267660337 +34.58874058564048 +34.55933222016745 +34.560718858968364 +34.52120949085528 +34.51738255576811 +34.48300544337011 +34.449734097563514 +34.409460802716225 +34.38582182133236 +34.39388831340253 +34.36796471775285 +34.39305099937841 +34.41220463206868 +34.369973498989324 +34.34891149784595 +34.353297842368136 +34.35564177243928 +34.39966739469088 +34.368212059122506 +34.35758176692047 +34.310306363556215 +34.29043982365878 +34.32435242757217 +34.35838419025895 +34.358142539592066 +34.400438755828375 +34.436630938853654 +34.482397476957935 +34.50512818623728 +34.51870846021014 +34.5024978024086 +34.48099725153128 +34.48410386267941 +34.49940410446298 +34.49517949750473 +34.50585462564788 +34.531134512150096 +34.56378010853567 +34.54583432273485 +34.56254588140798 +34.55358613829957 +34.517301935740186 +34.48047025462397 +34.46304406822477 +34.465969475478964 +34.483935211961565 +34.50886629432271 +34.50419793393501 +34.45998540043674 +34.48335770666437 +34.45335834418532 +34.4679850154969 +34.4522041256481 diff --git a/data/samples/arade4.csv b/data/samples/arade4.csv new file mode 100644 index 0000000..c9dffb2 --- /dev/null +++ b/data/samples/arade4.csv @@ -0,0 +1,1024 @@ +732.3785 +845.3342 +858.3417 +755.6418 +675.457 +672.1107 +644.3852 +401.001 +186.2407 +1037.404 +1063.291 +1049.064 +1131.187 +1191.762 +1056.15 +970.3967 +1026.888 +1110.171 +1133.783 +1080.276 +1118.605 +1125.179 +1323.143 +1304.983 +941.3412 +929.297 +1506.525 +1501.489 +1474.977 +1495.038 +992.0778 +1028.913 +1239.721 +1464.465 +1578.045 +1620.839 +1593.963 +1520.11 +1634.681 +1655.531 +1543.417 +1141.074 +1607.841 +1571.348 +1667.427 +1637.854 +1563.714 +1295.363 +1181.324 +1297.879 +1305.053 +1225.809 +1019.426 +1159.604 +1191.434 +1100.257 +932.8579 +1106.239 +1344.119 +997.5588 +1016.11 +1077.239 +977.6118 +1215.334 +1270.191 +1147.629 +868.0411 +926.7105 +1013.842 +971.7899 +838.1848 +816.8995 +825.5671 +814.501 +738.3611 +646.2198 +650.558 +723.5876 +423.6357 +232.501 +1054.841 +1183.133 +1173.675 +1292.36 +1140.416 +874.6097 +1323.982 +1341.297 +1115.82 +1224.287 +1051.634 +1279.224 +1447.6 +1376.571 +1329.657 +1137.472 +1177.543 +1371.483 +1393.847 +1526.447 +1399.292 +1110.696 +1331.904 
+1451.44 +1647.891 +1499.535 +1528.504 +1163.961 +1200.588 +1564.817 +1602.904 +1651.332 +1034.201 +1663.603 +1681.708 +1584.482 +1566.054 +1510.852 +1181.427 +1387.732 +1110.195 +1203.747 +1145.346 +830.6447 +1158.613 +1108.534 +1110.594 +1104.068 +1303.308 +1331.561 +1037.57 +967.1426 +1089.525 +1147.874 +1441.076 +1413.648 +996.014 +1088.591 +1065.356 +1090.49 +1037.387 +894.1847 +924.015 +850.7042 +796.8025 +689.1548 +602.9915 +655.5393 +689.8882 +331.7519 +353.9727 +973.1526 +1199.177 +1090.18 +1030.272 +1156.83 +745.2849 +1195.054 +1245.291 +1212.953 +1118.983 +899.8135 +1076.083 +1424.209 +1451.464 +1471.841 +1095.684 +1110.392 +1338.543 +1419.247 +1276.349 +1353.018 +925.2847 +1468.401 +1381.533 +1474.575 +1453.199 +1501.548 +1207.69 +1238.884 +1665.828 +1556.179 +1646.407 +938.3862 +1590.393 +1540.292 +1590.116 +1641.633 +1626.371 +1280.788 +1341.994 +1200.673 +1121.542 +1177.802 +961.803 +1017.455 +922.7045 +1005.58 +932.8448 +1133.809 +1153.397 +917.7402 +1196.105 +1089.843 +996.3677 +1185.23 +1542.577 +844.2268 +990.8832 +1028.3 +1109.72 +1003.484 +922.5016 +903.7413 +746.7487 +652.4911 +739.4032 +621.9377 +635.9507 +628.913 +366.401 +335.4576 +900.0152 +1169.224 +1169.68 +1320.766 +1260.548 +891.0255 +1298.155 +1148.531 +1064.667 +1059.031 +928.511 +1265.825 +1322.92 +1319.375 +1468.915 +1065.084 +982.3492 +1459.736 +1371.221 +1427.554 +1390.826 +974.805 +1328.121 +1309.32 +1368.233 +1379.184 +1384.117 +1166.76 +1110.864 +1487.958 +1607.134 +1581.453 +1087.11 +1512.121 +1419.005 +1615.421 +1550.217 +1441.483 +1205.145 +1324.357 +927.3223 +1092.645 +1039.703 +780.7592 +883.5186 +1060.683 +918.8723 +1206.093 +1439.342 +1450.397 +1228.6 +1100.457 +1282.196 +1216.993 +1510.927 +1444.196 +889.9295 +906.856 +907.8042 +888.212 +998.7509 +925.7321 +963.376 +848.2373 +803.4102 +801.386 +814.4825 +613.012 +750.5175 +258.401 +354.9615 +961.803 +1237.115 +634.8102 +441.7967 +571.2499 +578.8715 +596.825 +630.7361 +660.5275 +627.3935 +569.3618 +526.0161 +603.0272 +658.1208 +703.0035 +737.692 +638.4153 +629.1585 +150.523 +223.7998 +122.9913 +253.6262 +241.6067 +586.7182 +800.9073 +761.9966 +714.3868 +726.0057 +624.0403 +766.0666 +759.045 +741.3563 +722.1384 +636.02 +759.6865 +854.4985 +842.9883 +889.2831 +707.309 +661.9417 +758.9417 +841.4954 +727.6267 +595.7265 +668.6118 +733.5038 +765.7967 +849.1757 +724.6765 +764.7758 +801.2516 +1033.559 +946.9172 +995.1537 +660.594 +915.0672 +930.8843 +896.1736 +871.473 +898.6429 +900.2988 +751.2473 +427.8788 +593.606 +546.584 +638.3862 +562.3126 +480.0987 +618.994 +523.6407 +604.2366 +621.2305 +595.0482 +563.4566 +612.176 +665.1033 +567.3696 +692.0662 +665.5584 +599.8243 +652.3825 +253.2447 +222.86 +198.5428 +235.1875 +240.7155 +223.8705 +627.1378 +677.1097 +765.1118 +712.3815 +689.1708 +592.8278 +679.7278 +682.7546 +720.5715 +747.1733 +612.525 +645.173 +763.8227 +803.8781 +735.1042 +633.1362 +749.0434 +746.546 +685.8418 +639.7918 +681.9293 +725.93 +861.014 +901.8052 +878.379 +677.7958 +776.3749 +953.614 +994.267 +1080.068 +676.6946 +939.3116 +1005.724 +858.0468 +905.2966 +892.2255 +789.919 +838.3792 +546.2653 +619.3105 +559.3157 +732.6884 +554.8605 +552.636 +661.48 +712.2361 +659.5621 +661.2222 +631.5903 +572.7993 +573.2891 +660.8932 +651.8427 +730.7798 +761.5674 +675.457 +593.391 +172.2257 +167.9235 +197.834 +109.1193 +183.0483 +217.0818 +189.5295 +610.908 +833.4432 +736.9551 +727.7479 +681.3082 +649.1703 +794.5622 +746.1044 +761.2278 +729.2728 +675.7868 +755.3388 +807.208 +845.4898 +883.4288 +725.5253 +709.1202 +774.9331 +695.5651 +675.1207 +753.696 
+753.5978 +736.7555 +798.1431 +808.7758 +677.1615 +762.9368 +979.2426 +916.887 +888.5577 +678.7317 +890.4379 +898.1069 +891.0758 +921.7758 +862.4248 +860.5367 +743.6232 +522.6958 +589.172 +449.8198 +748.6953 +461.564 +487.7315 +692.0839 +608.8308 +685.3729 +755.4164 +661.8698 +652.7033 +673.8945 +617.7094 +693.2595 +779.2142 +757.7484 +690.9409 +662.1519 +222.3963 +238.9885 +218.123 +245.2315 +204.9858 +296.641 +245.2708 +116.3304 +124.5545 +650.7516 +709.6602 +659.3932 +689.118 +672.6663 +544.2457 +768.3976 +706.3035 +625.0887 +669.9238 +670.9 +655.8167 +818.9706 +706.3383 +736.5985 +644.1085 +678.4131 +712.3727 +726.4888 +636.2413 +651.7247 +648.8529 +743.5657 +687.3239 +746.7178 +672.4888 +697.62 +799.7371 +817.3748 +862.63 +642.4127 +782.5515 +799.1025 +904.3937 +813.4545 +746.0791 +751.1625 +861.7843 +538.6037 +525.6556 +455.2384 +691.8785 +401.9681 +567.9531 +548.2085 +514.8563 +612.7105 +629.0495 +679.587 +519.5122 +591.4963 +587.3513 +673.1833 +660.2455 +668.6474 +640.0785 +674.6633 +126.5988 +164.686 +158.7382 +158.2632 +136.4462 +238.0313 +156.4825 +254.0658 +254.7052 +107.895 +591.1078 +611.6542 +563.2585 +615.9737 +524.3776 +520.7339 +630.6368 +666.6429 +610.0195 +619.8726 +541.7506 +621.8185 +730.0084 +1043.103 +1098.11 +1099.801 +1163.829 +980.692 +918.2997 +1108.177 +1202.882 +1261.619 +998.8406 +966.2704 +898.6957 +1014.828 +992.3214 +961.2776 +953.849 +937.6435 +1059.899 +1022.399 +1012.502 +681.3933 +1103.138 +771.878 +747.2466 +1153.249 +1140.822 +879.3821 +1002.169 +1017.72 +1314.62 +1163.53 +1280.118 +1057.017 +996.5626 +852.7265 +931.4484 +1148.741 +1003.375 +724.371 +705.82 +702.9412 +1127.324 +872.7341 +1076.682 +670.8976 +780.61 +744.7898 +783.388 +914.7845 +1040.826 +1051.93 +761.7676 +832.3105 +908.0515 +895.1397 +971.8808 +752.4498 +974.9546 +1085.053 +841.9371 +831.6168 +849.8414 +546.5731 +498.96 +469.8827 +455.0622 +491.5138 +507.5872 +428.873 +443.4402 +789.3212 +660.4444 +1239.67 +1085.112 +907.854 +846.3008 +945.4285 +1001.766 +1311.837 +1272.959 +1250.577 +650.8997 +1141.757 +750.0516 +1196.776 +1124.665 +1063.124 +1033.972 +944.8459 +1308.519 +1146.458 +906.2837 +795.4592 +1046.687 +811.626 +814.9235 +1097.924 +1103.644 +1122.246 +984.9705 +836.742 +1145.797 +1182.597 +1186.724 +1162.612 +959.9593 +1036.084 +1102.196 +978.9963 +890.4495 +942.0818 +789.3835 +1080.075 +705.8372 +724.0443 +1339.549 +873.0168 +713.1848 +617.6449 +712.4843 +871.1567 +1098.372 +831.6331 +833.0364 +749.2269 +738.3104 +1092.491 +770.9082 +823.0854 +1227.936 +1092.892 +724.833 +772.783 +797.8812 +485.8925 +499.8242 +551.5162 +476.2053 +517.8725 +531.2783 +497.5518 +488.1472 +807.0476 +701.3143 +888.8832 +1393.352 +1326.151 +921.1512 +853.0538 +965.182 +1423.572 +1503.862 +1503.593 +936.7556 +842.2346 +774.3765 +1272.843 +1400.563 +1157.261 +1261.379 +1260.583 +956.2595 +779.2557 +837.005 +1038.904 +1162.002 +1058.721 +909.4827 +1390.336 +1371.252 +1061.613 +841.7349 +936.9955 +998.7132 +908.0098 +1071.619 +1126.292 +1198.261 +1151.277 +1139.748 +1141.682 +861.0515 +917.148 +947.075 +850.2862 +798.0688 +617.507 +1169.277 +690.0053 +778.994 +747.6212 +753.3088 +907.7788 +826.6711 +909.8475 +743.3541 +811.0209 +985.3462 +988.0807 +1024.302 +982.993 +829.8824 +1315.609 +645.6245 +673.2892 +722.0616 +614.9409 +621.9128 +523.2047 +487.5552 +521.6795 +528.3168 +501.1715 +475.614 +903.3972 +701.156 +1029.926 +1031.716 +1224.891 +1182.165 +1053.382 +639.4634 +1323.728 +1384.894 +1398.781 +1007.997 +1130.182 +934.4955 +887.6378 +1073.311 +826.664 +959.215 +1096.999 +1313.901 +1212.224 
+1022.285 +755.0198 +1118.937 +744.7339 +824.9138 +1022.854 +1252.923 +875.7835 +999.0955 +999.3553 +1315.586 +1174.798 +964.402 +916.3203 +1124.33 +941.7795 +1133.021 +1415.727 +1024.017 +604.7788 +688.8501 +949.5489 +1036.247 +716.5887 +779.1971 +730.3055 +645.1147 +744.4713 +711.8455 +801.066 +891.4916 +909.8441 +703.9511 +650.4846 +986.2018 +940.6403 +921.2119 +861.5848 +1041.066 +1149.843 +550.1105 +574.6282 +617.8572 +531.1967 +575.5292 +472.3337 +468.8463 +517.312 +503.3908 +447.2728 +441.1583 +933.6133 +872.2114 +1091.354 +1069.688 +1101.303 +968.4452 +1008.714 +825.147 +1388.132 +1105.845 +1267.137 +1190.245 +888.6013 +1310.937 +1300.236 +1397.241 +1131.575 +1064.096 +1086.724 +1383.121 +1473.805 +1521.129 +1204.407 +1052.723 +1307.829 +1565.898 +1398.327 +1117.484 +960.335 +1258.475 +1342.272 +1286.263 +1350.666 +1477.483 +1216.301 +1295.975 +1651.128 +1631.661 +1631.458 +969.8112 +1271.199 +1301.438 +1464.703 +1571.308 +1463.559 +1212.318 +1130.887 +1052.808 +1040.117 +888.212 +795.7513 +907.6837 +1340.485 +971.8496 +937.4142 +1386.708 +1293.792 +950.9255 +1184.419 +993.1599 +1113.511 +1430.226 +1572.509 +853.1278 +1013.311 +925.6786 +971.7833 +886.8553 +937.8654 +964.9137 +866.9833 +915.3342 +959.72 +871.9211 +724.5582 +729.3745 +151.1773 +316.6798 +1085.993 +1260.428 +1207.46 +844.9676 +913.6192 +765.166 +1300.191 +1244.936 +1153.55 +1104.025 +914.1596 +1107.852 +1615.816 +1466.488 +1465.201 +1063.803 +978.0754 +1339.347 +1419.744 +1574.139 +1363.275 +1061.068 +1307.083 +1226.972 +1392.401 +1275.893 +1595.966 +1171.385 +1363.056 +1662.41 +1609.005 +1584.56 +897.4273 +1254.449 +1142.445 +1638.496 +1505.339 +1487.225 +1097.331 +1239.628 +1027.537 +986.6719 +1069.204 +934.6566 +1056.683 +1050.275 +973.8691 +1012.036 +1172.721 +1242.617 +1061.273 +1064.752 +1013.048 +1055.987 +1369.436 +1360.979 +1005.319 +1028.314 +1044.169 +1026.681 +1004.981 +943.0963 +911.7972 +694.5577 +758.8915 +679.1378 +706.7498 +584.2632 +502.7685 +132.7935 diff --git a/data/samples/basel_temp_f.csv b/data/samples/basel_temp_f.csv new file mode 100644 index 0000000..e6db2aa --- /dev/null +++ b/data/samples/basel_temp_f.csv @@ -0,0 +1,1024 @@ +9.240245 +7.130245 +6.4902453 +5.2402453 +2.3402452 +0.7302454 +0.040245414 +-0.51975465 +-1.0097545 +-1.7497545 +-2.2697544 +-2.4597545 +-2.5197544 +-2.4597545 +-2.5897546 +-2.5097547 +-2.2197547 +-0.49975455 +1.6002454 +3.4002452 +4.7602453 +5.880245 +7.0902452 +7.8202453 +7.8102455 +6.3202453 +5.380245 +2.3502455 +0.19024539 +-0.9397546 +-1.6097547 +-2.1497545 +-2.6297545 +-2.8497548 +-3.1697545 +-3.3897548 +-3.6497545 +-3.6297545 +-3.7597547 +-3.5597548 +-3.6197548 +-1.4497546 +1.1102455 +2.9802456 +4.4902453 +5.610245 +6.880245 +7.1402454 +7.0102453 +4.2402453 +3.3802452 +2.0402455 +0.7202454 +-0.4497546 +-1.5397545 +-1.9497546 +-2.0897546 +-2.2297544 +-2.1697545 +-2.2497544 +-2.4497547 +-2.7997546 +-2.5097547 +-2.8697548 +-2.4197545 +-0.5697546 +0.19024539 +0.7902454 +1.2902454 +2.0502453 +2.4002454 +2.6902456 +2.5302453 +1.7302454 +1.2702454 +1.1602454 +0.9002454 +0.6202454 +0.34024543 +-0.28975457 +-0.5697546 +-1.0197546 +-1.6297547 +-2.4197545 +-3.1397548 +-3.9497547 +-4.3597546 +-4.5697546 +-4.4197545 +-2.1597548 +-0.35975462 +1.2602453 +2.5802455 +4.0202456 +5.0502453 +5.730245 +5.5902452 +4.1402454 +0.9202454 +-0.28975457 +-1.4397546 +-2.3297548 +-2.8297548 +-3.0197544 +-3.8697548 +-4.0897546 +-4.5697546 +-4.979755 +-5.1797547 +-4.499755 +-4.4197545 +-5.329755 +-5.479755 +-2.8797545 +-0.5397546 +0.7602454 +2.2602453 +3.6402454 +4.920245 +5.7402453 
+6.1002455 +5.6602454 +3.7402453 +0.9802454 +-0.72975457 +-1.8697547 +-2.8397546 +-3.5297546 +-4.209755 +-4.6997547 +-5.0097547 +-5.5497546 +-5.809755 +-5.809755 +-5.9097548 +-6.0497546 +-5.809755 +-2.7097545 +0.4902454 +2.8702455 +5.0302453 +6.7002454 +7.9702454 +8.690246 +8.890245 +8.370246 +7.610245 +3.9402456 +0.6802454 +-1.1197546 +-2.0597544 +-2.6997547 +-3.1997547 +-3.8697548 +-4.1697545 +-4.249755 +-4.039755 +-3.769755 +-3.8697548 +-4.079755 +-3.9097548 +-1.0397546 +2.1202455 +4.3302455 +6.2602453 +7.8702455 +9.150246 +10.080245 +10.500245 +10.620245 +8.020246 +3.3602452 +1.9002454 +0.47024542 +-0.48975456 +-0.4497546 +-1.1097546 +-1.3197546 +-1.3897547 +-1.5797547 +-1.7597545 +-1.4797546 +-1.1597546 +-0.9197546 +-0.48975456 +1.8902454 +4.4602456 +6.8102455 +8.300245 +8.690246 +9.3102455 +9.020246 +7.860245 +8.180245 +6.230245 +6.3002453 +6.0302453 +5.8202453 +5.6402454 +6.3102455 +6.3102455 +6.290245 +6.3202453 +6.2802453 +6.3702455 +6.250245 +6.270245 +6.380245 +6.4902453 +6.980245 +7.5302453 +8.260245 +8.8102455 +9.220245 +9.600245 +9.400246 +9.370245 +9.020246 +8.020246 +7.2602453 +7.3902454 +6.980245 +6.6602454 +6.150245 +5.6402454 +5.270245 +5.3002453 +5.4302454 +5.400245 +5.4102454 +5.650245 +5.6002455 +5.770245 +6.3702455 +7.1002455 +9.280245 +10.620245 +12.020246 +12.500245 +12.5602455 +12.5602455 +12.080245 +10.520246 +9.380245 +8.860246 +9.110246 +9.240245 +8.420245 +8.3102455 +8.130245 +7.500245 +7.3002453 +7.940245 +7.9702454 +7.6202455 +7.6402454 +7.630245 +8.780245 +10.340245 +11.420245 +12.600245 +13.720245 +14.390245 +14.970245 +15.120245 +14.120245 +13.910245 +10.930245 +7.480245 +6.210245 +5.8002453 +5.420245 +4.860245 +4.360245 +4.0902452 +3.7802453 +3.4602456 +3.3202453 +2.9102454 +2.7602453 +2.8402452 +5.540245 +9.180245 +12.070246 +14.150246 +15.850245 +16.900246 +17.590246 +17.790245 +16.560246 +14.960245 +13.390245 +12.680245 +9.790245 +8.380245 +8.010245 +7.6202455 +6.940245 +6.920245 +6.3702455 +5.920245 +5.750245 +5.670245 +5.5802455 +5.8202453 +7.7202454 +9.670245 +11.020246 +12.260245 +13.460245 +13.990245 +14.450245 +14.530245 +13.910245 +10.640245 +8.930245 +7.6402454 +6.380245 +5.500245 +5.500245 +5.0502453 +4.5002456 +4.5302453 +4.170245 +3.5402455 +3.7502456 +3.4602456 +3.3202453 +4.0702453 +7.0102453 +9.920245 +11.610246 +13.330245 +14.850245 +15.470245 +15.300245 +14.960245 +14.290245 +13.600245 +12.470245 +12.000245 +11.790245 +11.410245 +10.850245 +10.710245 +10.590245 +10.160245 +9.360246 +9.0602455 +8.790245 +7.9702454 +7.5102453 +7.1002455 +7.5802455 +8.250245 +9.250245 +9.750245 +10.160245 +10.430245 +10.8102455 +10.470245 +9.950245 +8.520246 +6.940245 +7.290245 +6.6202455 +5.8702455 +3.8902454 +2.7702456 +1.5802454 +1.3202454 +1.2002454 +0.7102454 +0.4202454 +-0.21975458 +-0.7197546 +0.7502454 +3.6202455 +6.1602454 +8.420245 +9.660245 +10.190246 +10.770246 +11.470245 +11.8102455 +11.400246 +10.240245 +9.130245 +9.120245 +8.0602455 +7.770245 +9.130245 +8.980246 +9.090245 +9.460245 +9.910245 +9.640245 +10.360246 +10.580245 +10.630245 +10.680245 +11.020246 +11.380245 +11.210245 +11.990245 +12.340245 +12.970245 +12.980246 +12.470245 +12.330245 +12.0602455 +12.110246 +12.370245 +12.250245 +12.380245 +12.050245 +11.900246 +11.880245 +11.5602455 +11.600245 +11.230246 +10.770246 +10.630245 +10.770246 +11.010245 +11.240245 +11.400246 +11.600245 +11.770246 +11.630245 +12.260245 +13.140245 +13.040245 +12.720245 +12.270246 +11.670245 +11.090245 +10.780245 +10.770246 +10.710245 +10.610246 +10.270246 +10.260245 +10.050245 +9.970245 +9.920245 +9.910245 
+10.230246 +10.370245 +10.420245 +10.670245 +10.850245 +11.420245 +11.640245 +12.0602455 +12.470245 +12.690246 +12.260245 +11.760245 +11.070246 +10.800245 +10.510245 +10.470245 +10.480246 +10.320246 +10.080245 +9.820246 +9.450245 +9.550245 +9.850245 +10.200245 +10.500245 +10.360246 +10.690246 +10.720245 +10.670245 +11.130245 +11.360246 +11.5602455 +11.000245 +8.980246 +7.420245 +6.610245 +5.6402454 +4.8102455 +3.9702454 +3.2402453 +3.3302455 +2.6502452 +2.0902452 +1.6602454 +1.6402454 +1.1902454 +1.3602455 +1.3702455 +1.6502454 +1.9002454 +2.2602453 +2.5302453 +3.9302454 +4.5602455 +4.750245 +5.1802454 +5.4102454 +5.5102453 +4.8002453 +3.4802456 +2.2002454 +1.7802454 +1.6302454 +1.7602454 +1.7902454 +1.7702454 +1.8202454 +1.5402454 +1.1402454 +1.0902455 +0.9002454 +0.7202454 +0.4002454 +0.2902454 +0.5902454 +1.2302454 +1.8402454 +2.7202454 +3.1002455 +3.5602455 +3.5102453 +3.3602452 +2.8702455 +2.5802455 +1.8302454 +1.0102454 +-0.04975462 +-0.9297546 +-2.1397548 +-2.9497547 +-3.3997545 +-3.729755 +-4.389755 +-4.619755 +-4.639755 +-4.7397547 +-4.9097548 +-2.6997547 +-0.75975454 +0.8502454 +2.4902453 +3.8102455 +4.8502455 +5.520245 +6.0802455 +6.230245 +5.7202454 +3.3302455 +0.5702454 +-0.11975461 +-0.5897546 +-1.2397546 +-1.8197545 +-1.6597546 +-1.3597546 +-2.1197548 +-2.5697546 +-2.8997545 +-2.9997544 +-2.3497548 +-2.5297546 +-1.1197546 +0.7602454 +2.2202454 +3.8102455 +5.0702453 +5.9502454 +6.5102453 +6.920245 +6.900245 +6.040245 +4.8002453 +3.3702455 +2.1802454 +1.9702454 +2.2502456 +3.1702452 +3.1302452 +3.1102452 +2.9702454 +2.8402452 +2.5702453 +2.5702453 +2.3402452 +2.5602455 +2.7102456 +3.5702453 +4.730245 +5.3202453 +6.3702455 +7.3402452 +7.920245 +8.320246 +8.290245 +8.220245 +7.710245 +6.3002453 +4.750245 +3.4702454 +2.5402455 +2.5102453 +1.3602455 +0.8102454 +0.6602454 +0.2902454 +0.4002454 +-0.019754589 +0.20024541 +0.3902454 +1.5802454 +3.4002452 +5.6402454 +7.670245 +9.380245 +10.690246 +11.490245 +12.090245 +12.350245 +11.740245 +10.240245 +9.100245 +8.650246 +9.000245 +9.070246 +8.700245 +8.390245 +8.070246 +7.5602455 +6.8102455 +6.6402454 +6.2202454 +5.3202453 +4.8302455 +5.5802455 +7.5802455 +8.590245 +10.000245 +11.000245 +11.240245 +11.300245 +11.620245 +11.180245 +10.170245 +9.230246 +8.610246 +8.140245 +8.170245 +8.010245 +7.8502455 +7.710245 +7.650245 +7.4502454 +7.3902454 +7.6602454 +7.4902453 +7.0102453 +6.7202454 +6.6602454 +7.000245 +7.540245 +8.100245 +9.040245 +9.770246 +10.360246 +9.830245 +9.760245 +9.760245 +8.660245 +8.630245 +9.020246 +9.770246 +10.330245 +11.030245 +11.300245 +11.330245 +11.460245 +11.800245 +11.880245 +11.590245 +11.530245 +11.250245 +11.130245 +11.510245 +11.670245 +12.100245 +11.850245 +11.630245 +11.580245 +11.840245 +10.340245 +9.890245 +9.170245 +8.370246 +7.6202455 +7.3502455 +7.1402454 +6.610245 +6.3702455 +6.190245 +6.0802455 +6.170245 +6.210245 +6.230245 +6.0602455 +6.4302454 +6.5702453 +7.1002455 +7.7002454 +8.500245 +8.940246 +9.590245 +9.980246 +9.980246 +9.920245 +9.480246 +9.180245 +8.420245 +7.860245 +7.7202454 +7.8902454 +7.8102455 +7.960245 +8.140245 +8.300245 +8.130245 +8.320246 +8.630245 +9.260245 +9.530245 +9.800245 +10.160245 +10.530245 +11.440246 +12.460245 +13.440246 +13.720245 +14.070246 +14.110246 +13.880245 +13.250245 +11.080245 +11.510245 +10.930245 +8.640245 +8.140245 +7.8202453 +7.3202453 +6.2202454 +5.020245 +4.8902454 +4.880245 +4.4602456 +4.0402455 +5.730245 +8.440246 +10.630245 +12.630245 +14.220245 +15.570246 +16.470245 +17.040245 +17.180244 +16.670246 +15.480246 +12.930245 +11.440246 +10.540245 
+10.620245 +12.370245 +12.720245 +11.420245 +10.570246 +10.3102455 +10.110246 +9.680245 +9.670245 +9.610246 +9.700245 +10.380245 +10.730246 +10.900246 +11.100245 +11.420245 +11.270246 +10.620245 +10.720245 +10.850245 +9.550245 +9.260245 +8.750245 +8.700245 +8.460245 +8.460245 +8.160246 +7.920245 +7.8402452 +8.030246 +8.190246 +7.9702454 +7.670245 +7.5702453 +7.650245 +7.610245 +7.8002453 +7.440245 +7.1602454 +6.650245 +6.540245 +6.460245 +6.5302453 +6.7002454 +6.3702455 +5.0302453 +5.0102453 +4.2202454 +3.8002453 +4.3502455 +3.9102454 +3.0502453 +2.5202456 +2.1502454 +1.5202454 +1.2302454 +1.6302454 +1.1702454 +2.4402454 +3.5302453 +4.3702455 +4.8402452 +5.400245 +5.8502455 +6.040245 +6.6602454 +6.920245 +6.6202455 +5.420245 +1.9802454 +0.6502454 +0.17024541 +1.0002453 +0.9002454 +0.4902454 +0.070245415 +-0.17975461 +-0.5997546 +-0.9597546 +-1.2897546 +-1.3297546 +-0.99975455 +-0.03975457 +1.7302454 +3.0902452 +4.360245 +4.750245 +4.5702453 +4.8502455 +5.210245 +5.150245 +4.9102454 +3.5502453 +3.1202455 +2.6002455 +2.1302454 +1.1102455 +0.6102454 +0.23024541 +0.17024541 +0.16024542 +0.060245395 +-0.03975457 +-0.029754579 +0.050245404 +-0.62975454 +0.4802454 +1.9702454 +3.0302453 +3.8302455 +4.5602455 +5.420245 +5.8002453 +5.6802454 +5.6602454 +5.3202453 +4.690245 +4.2202454 +3.7902455 +3.6802454 +3.5102453 +3.4902453 +3.4102454 +3.3302455 +3.0802455 +3.0002456 +3.0302453 +3.0702453 +2.9402456 +2.8802452 +2.8702455 +2.8902454 +2.9002452 +3.3102455 +3.7702456 +3.9602456 +3.5302453 +3.2302456 +2.8602452 +3.2602453 +3.5502453 +3.1002455 +2.1002455 +1.7502455 +1.5102453 +1.9002454 +2.1602454 +2.4702454 +2.2102454 +2.2302454 +2.1502454 +1.9202454 +1.7602454 +1.8302454 +2.5102453 +3.1602454 +3.7902455 +4.110245 +4.170245 +3.7502456 +4.1602454 +4.8402452 +4.8302455 +4.2602453 +3.6502452 +2.3702455 +2.3502455 +1.6602454 +0.8302454 +0.9302454 +0.2502454 +0.090245396 +-0.48975456 +0.010245383 +0.5702454 +0.8102454 +1.4302454 +1.1902454 +1.3302454 +1.3202454 +1.5802454 +2.0002456 +2.0802455 +2.4802454 +2.2202454 +1.9402454 +2.2502456 +1.7402455 +1.6102453 +0.4802454 +-0.49975455 +-1.3897547 +-1.8897547 +-1.8297547 +-2.3197546 +-2.4197545 +-2.4597545 +-3.3197546 +-4.209755 +-4.6097546 +-4.559755 +-3.4297547 +-1.0797546 +-0.10975462 +1.1102455 +1.9202454 +2.0102453 +1.6702454 +1.7302454 +2.2002454 +2.5102453 +1.2402453 +1.2202454 +1.0902455 +0.9402454 +0.7102454 +0.6102454 +0.7602454 +0.5002454 +0.4202454 +0.23024541 +0.3902454 +0.3102454 +0.100245416 +0.100245416 +-0.27975458 +-0.019754589 +0.47024542 +1.4602454 +2.1202455 +2.8502455 +3.2602453 +3.7602453 +3.7602453 +3.4002452 +3.1602454 +2.5702453 +1.8002454 +1.7802454 +2.2002454 +2.1602454 +2.0002456 +1.9702454 +2.1602454 +2.2202454 +2.1102452 +1.8302454 +2.0202456 +2.0702453 +2.0802455 diff --git a/data/samples/basel_wind_f.csv b/data/samples/basel_wind_f.csv new file mode 100644 index 0000000..f14b79c --- /dev/null +++ b/data/samples/basel_wind_f.csv @@ -0,0 +1,1024 @@ +3.319036 +2.1897945 +2.7416782 +2.9024127 +2.6208394 +2.8799999 +2.9024127 +3.319036 +3.319036 +3.319036 +3.319036 +3.2599385 +4.452954 +4.3498964 +5.6233797 +5.6002855 +4.39436 +7.491114 +8.209263 +6.9247384 +6.439876 +5.95906 +5.634891 +4.6800003 +4.452954 +4.73506 +5.0911684 +4.73506 +5.4119864 +5.0911684 +5.0528407 +4.73506 +4.452954 +5.1544156 +6.130579 +7.5685663 +7.8625183 +7.342588 +6.489992 +6.8399997 +5.506941 +5.0142193 +5.4477882 +5.771239 +6.519877 +6.130579 +5.4477882 +6.28713 +5.4833565 +5.154415 +5.634891 +5.4833565 +5.6920996 +5.8603754 +5.8603754 +5.2416787 
+5.5887027 +5.5887027 +6.2145634 +6.0347 +5.3999996 +4.68 +6.28713 +5.315336 +5.991594 +5.991594 +4.6938257 +5.0142193 +5.4119864 +5.3999996 +5.4477882 +5.4119864 +5.506941 +5.1544156 +4.0249224 +4.0249224 +3.8773184 +3.319036 +4.3349743 +3.5455887 +3.4152596 +4.2136917 +4.0249224 +2.5455842 +2.1897945 +4.39436 +2.8116899 +2.8116899 +4.39436 +3.8268526 +4.3349743 +5.1165614 +5.1919937 +4.198285 +4.198285 +5.0142193 +5.634891 +6.989936 +6.9153743 +7.491114 +7.289445 +7.289445 +7.5942082 +7.729527 +9.178235 +10.495713 +10.630672 +8.654986 +8.311245 +9.746631 +11.252519 +11.503113 +12.55879 +13.896187 +14.264361 +12.287555 +10.440001 +10.799999 +10.805999 +9.826088 +11.212135 +12.287555 +13.324863 +10.308831 +9.085988 +8.913181 +9.290511 +7.9932966 +7.289445 +8.049845 +9.885262 +10.703569 +11.720751 +11.570515 +11.252519 +10.464798 +6.2145634 +5.2416787 +3.6179552 +3.0758414 +3.8268526 +4.5820518 +5.3517847 +6.8777895 +7.386582 +7.9932966 +8.587338 +9.504273 +10.829959 +12.979984 +16.981165 +20.674156 +22.915916 +23.469128 +21.566975 +16.24394 +16.746773 +19.05281 +18.345877 +15.937878 +14.458382 +10.966713 +7.6367526 +6.763786 +7.628263 +8.089993 +8.049845 +8.373386 +8.707237 +8.049845 +6.489992 +5.8048253 +5.7599998 +5.4477882 +5.5887027 +5.6920996 +5.3517847 +5.0142193 +5.4833565 +4.2136917 +3.7064266 +4.0249224 +4.198285 +3.7064266 +5.3517847 +7.42159 +5.1165614 +4.5820518 +5.6233797 +5.8603754 +5.8048253 +5.95906 +6.989936 +6.6087217 +7.5685663 +7.8954163 +7.5685663 +7.4128532 +7.10031 +6.6087217 +6.9247384 +8.225035 +9.107359 +9.885262 +10.464798 +11.159999 +12.682018 +12.371645 +11.177405 +11.440979 +11.298495 +11.384198 +10.883676 +9.511088 +8.225035 +7.8954163 +7.072878 +6.162207 +5.7599998 +5.4119864 +5.0911684 +5.506941 +5.5887027 +6.489992 +7.0911775 +7.24486 +7.628263 +7.172949 +7.072878 +7.42159 +7.42159 +6.725354 +5.154415 +6.830519 +6.763786 +6.9247384 +6.9247384 +7.10031 +7.10031 +7.5685663 +7.5685663 +7.10031 +7.289445 +7.289445 +7.5942082 +7.289445 +7.491114 +7.5942082 +7.289445 +7.386582 +7.386582 +6.3690495 +6.4096174 +6.9153743 +7.2 +6.297428 +6.618519 +7.127636 +7.8954163 +6.9153743 +6.638072 +6.989936 +7.10031 +5.3517847 +5.0142193 +9.0 +11.983188 +9.290511 +10.799999 +11.709688 +11.966954 +13.276144 +10.799999 +11.200571 +12.682018 +11.212135 +11.384198 +10.948973 +9.19939 +7.8954163 +6.489992 +5.6002855 +5.3999996 +5.154415 +5.0142193 +4.829907 +5.00128 +4.829907 +3.5455887 +0.71999997 +0.0 +0.35999998 +0.5091169 +1.5273507 +1.5273507 +2.3051248 +3.319036 +3.219938 +3.671294 +4.6800003 +5.506941 +5.8603754 +5.3517847 +4.896529 +4.802999 +4.5820518 +5.634891 +6.763786 +7.2 +6.3690495 +6.1305785 +5.692099 +7.491114 +6.696387 +6.193674 +5.95906 +5.8603754 +5.8048253 +6.2145634 +6.379216 +6.6185193 +6.9527545 +6.9527545 +6.439876 +7.0911775 +6.830519 +5.8603754 +5.506941 +4.73506 +4.73506 +4.73506 +4.5536795 +4.6800003 +6.489992 +7.289445 +7.4128532 +7.5685663 +7.628263 +6.489992 +6.28713 +7.24486 +6.151683 +7.0911775 +7.24486 +6.9247384 +8.587338 +9.021574 +8.089993 +9.346143 +10.390226 +10.703569 +11.298495 +9.107359 +9.028754 +10.00256 +9.360001 +8.891344 +8.534353 +6.9247384 +8.209263 +8.089993 +7.42159 +8.089993 +8.854829 +7.729527 +6.379216 +8.049845 +10.799999 +11.885453 +11.525623 +11.620809 +11.631956 +9.6932955 +7.968939 +8.707238 +9.227524 +7.729527 +5.904439 +6.792466 +6.12 +4.510787 +4.510787 +4.2136917 +6.519877 +6.5693827 +7.5599995 +7.6367526 +7.8625183 +7.628263 +7.5685663 +7.172949 +6.8399997 +6.4799995 +7.5685663 +9.449572 +9.693296 +11.298495 
+16.80857 +16.78156 +11.480557 +9.793059 +10.009036 +9.0 +9.3669195 +11.879999 +12.24 +13.358861 +12.682018 +14.654254 +16.992609 +18.875126 +18.440998 +17.148247 +16.80857 +15.281989 +14.003028 +13.493999 +13.755579 +15.349684 +15.30741 +16.279802 +17.414474 +18.193361 +16.610792 +14.277983 +12.475961 +13.089354 +14.182355 +13.849477 +14.186923 +15.072783 +15.696165 +17.10284 +17.581125 +17.935081 +18.218275 +17.673029 +17.174677 +16.981165 +16.575644 +16.363178 +16.055355 +16.323528 +14.904173 +17.072504 +17.786331 +16.418526 +16.676977 +14.081477 +11.592894 +9.957108 +8.654987 +8.161764 +9.0 +8.707237 +6.830519 +7.2359104 +7.2805495 +6.5693827 +6.2145634 +6.2145634 +5.8603754 +5.0911684 +5.0911684 +5.4119864 +6.8399997 +7.928178 +6.830519 +5.904439 +5.95906 +6.9899354 +6.4799995 +3.9763298 +5.495161 +5.315336 +5.0911684 +5.04 +5.0528407 +4.3349743 +5.4119864 +4.3349743 +4.32 +3.96 +3.6 +3.6 +3.6179552 +3.2599385 +4.379589 +5.506941 +6.162207 +6.4799995 +6.12 +6.2145634 +6.6087217 +9.726664 +12.096214 +10.703569 +10.587918 +12.979984 +15.778516 +17.935081 +19.008545 +21.603 +19.376562 +22.392464 +22.148046 +22.91026 +25.741888 +25.922499 +26.42753 +25.364037 +24.40046 +22.81957 +20.929596 +20.478907 +16.700275 +16.263872 +15.696165 +15.937878 +14.345898 +13.1041975 +8.049845 +8.714677 +7.2 +3.396233 +4.1046314 +4.0249224 +1.1384199 +1.1384199 +2.7416782 +3.7585104 +3.7585104 +3.7585104 +3.5455887 +3.6179552 +3.96 +4.0249224 +3.5455887 +3.096837 +3.0547013 +3.6 +2.5959969 +1.8 +1.8 +2.5455842 +3.6 +4.072935 +4.1046314 +3.096837 +3.319036 +3.319036 +3.8268526 +3.6 +3.8939953 +4.3349743 +4.6102495 +4.8965297 +5.495161 +5.692099 +5.1919937 +5.1919937 +5.95906 +4.802999 +3.671294 +3.671294 +3.7585104 +3.8939953 +5.00128 +5.00128 +6.28713 +5.506941 +5.8048253 +4.802999 +4.2136917 +3.7064266 +3.0547013 +2.8116899 +2.0991426 +1.9386592 +1.2979984 +1.609969 +2.8116899 +3.7064266 +3.396233 +5.3999996 +5.00128 +4.829907 +5.3517847 +5.6233797 +3.7064266 +2.8116899 +2.7416782 +2.0991426 +1.2979984 +1.8 +3.7585104 +4.802999 +4.0249224 +3.219938 +4.6800003 +2.2768397 +2.2768397 +3.0758414 +3.396233 +2.9024127 +3.396233 +3.319036 +3.7585104 +3.0758414 +2.968636 +3.0758414 +3.671294 +5.04 +6.489992 +5.7599998 +2.9024127 +3.4152596 +6.0347 +3.8268526 +3.096837 +4.5536795 +4.510787 +4.39436 +3.219938 +4.2136917 +5.4119864 +5.506941 +5.3517847 +5.2416787 +4.802999 +6.28713 +6.725354 +6.379216 +7.5170207 +8.311245 +7.628263 +6.763786 +7.7879906 +8.14587 +7.127636 +6.489992 +7.4128532 +6.763786 +6.830519 +6.489992 +5.5887027 +5.1544156 +5.7599998 +3.319036 +4.452954 +4.452954 +3.219938 +3.0547013 +3.319036 +3.6 +3.7064266 +3.6 +3.2399998 +3.671294 +1.8 +1.484318 +1.2979984 +2.52 +3.2599385 +5.4119864 +5.5887027 +2.9024127 +1.8 +1.2979984 +1.835647 +2.1897945 +2.4149535 +3.396233 +3.319036 +3.2599385 +2.2768397 +2.5455842 +2.6208394 +2.968636 +2.9024127 +2.52 +2.7416782 +2.6208394 +1.9386592 +1.9386592 +2.2768397 +2.1897945 +2.5959969 +4.072935 +3.096837 +2.2768397 +2.6208394 +2.5455842 +1.484318 +2.5959969 +1.4399999 +2.2768397 +2.5455842 +2.5455842 +2.9024127 +4.2136917 +4.510787 +5.1919937 +6.193674 +6.193674 +6.792466 +7.5942082 +8.669949 +10.163227 +10.308831 +8.473393 +8.825508 +8.759178 +6.9899354 +6.2145634 +7.2805495 +7.5942073 +7.342588 +5.95906 +5.154415 +3.8939953 +3.8939953 +3.6 +3.8268526 +5.2416787 +4.1046314 +2.8116899 +2.9024127 +4.0249224 +5.1544156 +5.95906 +9.957108 +10.883676 +12.181624 +13.479583 +13.979872 +12.783802 +8.913181 +10.195057 +11.246759 +10.990322 +12.4811535 +14.336861 
+13.708391 +16.267857 +18.03237 +19.826164 +21.178896 +22.288042 +22.9724 +22.66857 +23.664352 +24.27265 +24.066206 +23.979893 +23.185787 +23.979893 +21.868332 +26.87491 +27.887802 +27.66384 +27.859905 +29.354904 +29.46287 +26.478971 +21.794127 +17.935081 +15.034041 +9.585739 +6.297428 +5.2416787 +6.8777895 +8.714677 +9.178235 +8.913181 +9.19939 +10.390226 +9.779817 +11.159999 +7.5599995 +7.928178 +8.287822 +8.557102 +7.342588 +5.8048253 +4.73506 +3.8268526 +3.219938 +3.319036 +4.379589 +4.6938252 +4.3349743 +3.9763298 +3.96 +4.32 +4.68 +3.96 +4.0249224 +3.7064266 +4.198285 +5.495161 +6.1305785 +5.3517847 +5.904439 +7.2 +7.5942082 +7.2 +7.386582 +4.8965297 +6.763786 +4.39436 +4.198285 +5.1919937 +5.8048253 +5.991594 +5.3999996 +6.4096174 +5.991594 +5.692099 +6.4096174 +6.792466 +6.4096174 +6.618519 +5.6233797 +3.7064266 +2.1897945 +5.04 +8.089993 +9.793058 +11.901798 +11.183201 +12.144331 +9.6932955 +6.725354 +6.5693827 +6.9527545 +7.9036193 +8.209263 +8.669949 +9.686609 +7.491114 +8.699793 +7.968939 +6.130579 +6.5693827 +6.28713 +7.8954163 +7.0911775 +8.089993 +9.565437 +10.483357 +11.013882 +11.966953 +12.261158 +13.698934 +14.973576 +15.725037 +16.055353 +15.141414 +13.797913 +14.759999 +16.179985 +16.267857 +18.359999 +19.652176 +21.868332 +21.485697 +22.392464 +23.507751 +23.424908 +23.424908 +22.961115 +23.277834 +23.269482 +22.782625 +21.178896 +23.358423 +23.510508 +22.33161 +22.007162 +21.316135 +15.856356 +10.805999 +10.086427 +7.928178 +6.489992 +5.8603754 +7.9932976 +6.6087217 +6.3690495 +5.3517847 +3.563818 +3.8773184 +2.6208394 +1.1384199 +1.2979984 +2.5455842 +3.219938 +2.8799999 +3.671294 +5.154415 +5.8048253 +6.1305785 +6.3690495 +5.991594 +5.2416787 +9.0 +9.021574 +8.404285 +7.289445 +6.439876 +4.6800003 +4.802999 +5.0911684 +4.3349743 +4.32 +4.802999 +4.896529 +5.1544156 +4.452954 +4.379589 +4.6938252 +5.0528407 +5.5887027 +6.763786 +8.0899935 +8.587338 +8.161764 +6.193674 +7.5942082 +8.669949 +7.9036193 +6.9527545 +5.3517847 +4.452954 +4.3349743 +4.3349743 +3.6179552 +3.96 +3.96 +3.6179552 +3.9763298 +3.6179552 +3.9763298 +4.0249224 +4.6800003 +3.9763298 +4.1046314 +3.96 +4.73506 +5.0911684 +5.00128 +4.2136917 +6.0347 +5.3999996 +5.315336 +5.3517847 +4.5536795 +4.379589 +4.802999 +4.379589 +3.9763298 +3.9763298 +4.379589 +4.379589 +4.452954 +4.452954 +4.452954 +4.452954 +3.671294 +3.5455887 +4.510787 +5.692099 +6.297428 +6.9153743 +6.8777895 +4.6800003 +7.968939 +7.8954163 +7.24486 +6.28713 +5.634891 +5.495161 +5.315336 +4.829907 +5.3517847 +6.151683 +6.151683 +5.815978 +5.815978 +5.3517847 +5.3517847 +5.3517847 +3.6 +3.8268526 +5.8603754 +6.696387 +7.7879906 +8.78872 +8.209263 +5.6002855 +6.3690495 +6.4096174 +7.0911775 +6.489992 +5.2416787 +4.452954 +4.0249224 +3.2599385 +3.2399998 +2.8799999 +2.9024127 +3.2599385 +3.671294 +3.671294 +2.9024127 +2.5455842 +3.2599385 +3.0758414 +4.6102495 +5.3999996 +6.696387 +7.289445 +7.24486 diff --git a/data/samples/bird_migration_f.csv b/data/samples/bird_migration_f.csv new file mode 100644 index 0000000..87191df --- /dev/null +++ b/data/samples/bird_migration_f.csv @@ -0,0 +1,1024 @@ +8.3495 +8.56067 +7.86233 +7.883 +7.86233 +7.862 +7.86217 +7.86183 +7.86233 +7.8675 +7.87067 +7.97617 +7.97617 +7.97967 +7.99767 +8.0015 +8.01367 +8.00583 +7.9915 +8.0065 +7.9695 +8.002 +8.00467 +7.99117 +8.00467 +8.00583 +8.00883 +8.0115 +8.03267 +8.02833 +8.0285 +7.99967 +8.01383 +8.03117 +8.00483 +8.03233 +8.01033 +8.01617 +7.98467 +7.97217 +7.985 +7.98267 +7.9825 +7.99067 +7.9905 +7.96817 +7.94583 +7.8905 +7.90283 +7.90683 +7.91617 +7.94683 
+7.95533 +7.94433 +7.94217 +7.9525 +7.9475 +7.9475 +7.94633 +7.94183 +7.9445 +7.9445 +7.94433 +7.9445 +7.94233 +7.9445 +7.94233 +7.94417 +7.9445 +7.94283 +7.94217 +7.94283 +7.94283 +7.94217 +7.944 +7.942 +7.9445 +7.94283 +7.9475 +7.94183 +7.94433 +7.942 +7.9445 +7.9475 +7.95 +7.93067 +7.93633 +8.04433 +8.06183 +8.05933 +8.04633 +8.05417 +8.05317 +8.05567 +8.0545 +8.05817 +8.04917 +8.03933 +8.06783 +8.04667 +8.0515 +8.044 +8.05183 +8.0365 +8.04183 +8.04683 +8.04133 +8.05767 +8.04267 +8.05183 +8.06633 +8.03233 +8.0635 +8.06417 +8.0615 +8.062 +8.0425 +8.0635 +8.03883 +8.09433 +8.07617 +8.07383 +8.09167 +8.072 +8.073 +8.09383 +8.089 +8.091 +8.09483 +8.08367 +8.06867 +8.09833 +8.07 +8.07633 +8.0695 +8.07633 +8.07333 +8.07917 +8.07933 +8.074 +8.09617 +8.086 +8.07167 +8.072 +8.0715 +8.0845 +8.072 +8.09733 +8.0755 +8.09517 +8.09283 +8.07483 +8.09883 +8.09833 +8.06867 +8.0945 +8.0915 +8.09967 +8.09767 +8.07067 +8.09333 +8.09083 +8.08233 +8.098 +8.10017 +8.08167 +8.0745 +8.09183 +8.08867 +8.08867 +8.0915 +8.08467 +8.093 +8.0895 +8.07817 +8.09617 +8.07817 +8.09483 +8.08183 +8.08867 +8.0815 +8.08867 +8.08133 +8.08867 +8.08867 +8.07717 +8.07067 +8.07483 +8.07483 +8.07383 +8.07167 +8.07117 +8.072 +8.06783 +8.076 +8.07 +8.11233 +8.102 +8.11183 +8.11183 +8.103 +8.103 +8.1005 +8.10183 +8.10283 +8.1615 +8.08583 +8.087 +8.08767 +8.08983 +8.10483 +8.0895 +8.05833 +8.068 +8.05883 +8.059 +8.05933 +8.05917 +8.05917 +8.059 +8.057 +8.05917 +8.06 +8.05933 +8.05917 +8.05917 +8.05933 +8.05917 +8.05917 +8.05933 +8.06 +8.05917 +8.059 +8.05983 +8.05917 +8.059 +8.05983 +8.05883 +8.05917 +8.05967 +8.05917 +8.05917 +8.05917 +8.0595 +8.059 +8.059 +8.05967 +8.0595 +8.0595 +8.05883 +8.05867 +8.05867 +8.05633 +8.06017 +8.0585 +8.05883 +8.05967 +8.05883 +8.05883 +8.05883 +8.05883 +8.0585 +8.05867 +8.05867 +8.05867 +8.05933 +8.0595 +8.05967 +8.05967 +8.05867 +8.05867 +8.05967 +8.05967 +8.05883 +8.059 +8.05967 +8.05867 +8.05933 +8.0595 +8.0675 +8.05883 +8.05883 +8.0595 +8.05933 +8.0585 +8.05967 +8.05967 +8.05917 +8.05883 +8.05933 +8.05867 +8.05933 +8.05833 +8.0595 +8.059 +8.059 +8.0595 +8.05883 +8.059 +8.059 +8.05867 +8.05833 +8.059 +8.05883 +8.0585 +8.05833 +8.0585 +8.05917 +8.05883 +8.05867 +8.0585 +8.05917 +8.0585 +8.0655 +8.05833 +8.0575 +8.0585 +8.05883 +8.0585 +8.0585 +8.05783 +8.0585 +8.05867 +8.05833 +8.0585 +8.0585 +8.05867 +8.0585 +8.0585 +8.0585 +8.0585 +8.0585 +8.0585 +8.0585 +8.0585 +8.05967 +8.05833 +8.05833 +8.05833 +8.0585 +8.05933 +8.05833 +8.0585 +8.05833 +8.05833 +8.05933 +8.05833 +8.05933 +8.05833 +8.0595 +8.06 +8.05833 +8.05967 +8.0585 +8.05833 +8.05833 +8.05833 +8.05817 +8.05833 +8.05833 +8.0585 +8.06117 +8.0585 +8.05867 +8.05833 +8.05833 +8.05817 +8.0585 +8.05833 +8.0585 +8.0585 +8.06367 +8.06517 +8.06633 +8.0565 +8.0585 +8.05883 +8.07 +8.05817 +8.05867 +8.0585 +8.0585 +8.0585 +8.0585 +8.0585 +8.05833 +8.05883 +8.05917 +8.07133 +8.06533 +8.05883 +8.05883 +8.05883 +8.06567 +8.058 +8.075 +8.05983 +8.06367 +8.06133 +8.061 +8.0645 +8.05917 +8.06417 +8.06467 +8.06433 +8.064 +8.06417 +8.064 +8.06417 +8.06333 +8.064 +8.06317 +8.06 +8.06333 +8.0635 +8.06317 +8.06317 +8.08583 +8.06317 +8.063 +8.06583 +8.063 +8.0595 +8.05917 +8.064 +8.0615 +8.06067 +8.06067 +8.0685 +8.06217 +8.0635 +8.064 +8.05883 +8.06033 +8.06333 +8.06433 +8.06417 +8.06417 +8.06433 +8.064 +8.064 +8.064 +8.06417 +8.06367 +8.06417 +8.06417 +8.064 +8.06383 +8.0635 +8.06033 +8.05983 +8.06217 +8.06183 +8.06333 +8.05967 +8.05933 +8.05933 +8.064 +8.064 +8.06133 +8.064 +8.064 +8.06433 +8.06433 +8.064 +8.064 +8.065 +8.06417 +8.06383 +8.0635 +8.06417 
+8.08033 +8.06533 +8.064 +8.06717 +8.06483 +8.0645 +8.0645 +8.06417 +8.0645 +8.07083 +8.064 +8.06517 +8.06233 +8.06033 +8.0635 +8.06517 +8.063 +8.06267 +8.064 +8.06417 +8.0635 +8.06517 +8.06433 +8.06483 +8.06667 +8.06467 +8.06467 +8.06767 +8.0645 +8.06067 +8.06283 +8.063 +8.06283 +8.0595 +8.06283 +8.06267 +8.063 +8.06267 +8.06267 +8.06267 +8.06283 +8.05883 +8.06283 +8.06233 +8.06267 +8.07 +8.0795 +8.08467 +8.0625 +8.0625 +8.0625 +8.06283 +8.0625 +8.063 +8.071 +8.06267 +8.07917 +8.06283 +8.06333 +8.05867 +8.06233 +8.0585 +8.05883 +8.06267 +8.05883 +8.05867 +8.0625 +8.06267 +8.06267 +8.06267 +8.063 +8.0625 +8.06267 +8.06267 +8.06283 +8.06267 +8.06283 +8.06283 +8.06283 +8.063 +8.06767 +8.06267 +8.06283 +8.06283 +8.06267 +8.063 +8.06267 +8.06317 +8.06267 +8.06317 +8.06267 +8.06283 +8.06267 +8.06267 +8.06267 +8.06283 +8.06283 +8.06267 +8.08117 +8.0625 +8.06283 +8.063 +8.0625 +8.05833 +8.0625 +8.06267 +8.06267 +8.06267 +8.082 +8.07983 +8.06283 +8.06367 +8.06267 +8.0625 +8.0755 +8.063 +8.06267 +8.0635 +8.06267 +8.06267 +8.0635 +8.06283 +8.06267 +8.06267 +8.06483 +8.06267 +8.06367 +8.06567 +8.06283 +8.0625 +8.0635 +8.06283 +8.06267 +8.0635 +8.06283 +8.06267 +8.06267 +8.06283 +8.06283 +8.063 +8.0635 +8.0635 +8.06683 +8.06283 +8.06283 +8.05833 +8.06317 +8.06283 +8.063 +8.0635 +8.06317 +8.063 +8.067 +8.0585 +8.09167 +8.05867 +8.05833 +8.05883 +8.05867 +8.05867 +8.05833 +8.05833 +8.064 +8.05817 +8.05867 +8.06367 +8.05867 +8.0585 +8.0645 +8.0585 +8.0585 +8.0585 +8.0585 +8.059 +8.05883 +8.06367 +8.05883 +8.0585 +8.05833 +8.05817 +8.06233 +8.06217 +8.06267 +8.062 +8.062 +8.06267 +8.06317 +8.06283 +8.063 +8.06267 +8.063 +8.06217 +8.0625 +8.061 +8.0625 +8.06317 +8.0625 +8.06267 +8.061 +8.06267 +8.06367 +8.06333 +8.06083 +8.06167 +8.063 +8.0615 +8.08483 +8.06317 +8.06167 +8.06167 +8.06217 +8.06333 +8.063 +8.06233 +8.062 +8.062 +8.062 +8.06117 +8.06233 +8.062 +8.062 +8.06817 +8.06233 +8.06783 +8.06133 +8.06267 +8.06283 +8.08817 +8.059 +8.06283 +8.067 +8.061 +8.0615 +8.06267 +8.0625 +8.06117 +8.06317 +8.06217 +8.05933 +8.0635 +8.06167 +8.06217 +8.063 +8.06017 +8.06017 +8.06117 +8.06117 +8.05967 +8.06083 +8.06117 +8.06117 +8.06033 +8.06117 +8.05983 +8.05983 +8.061 +8.05983 +8.05983 +8.06 +8.05983 +8.05967 +8.05983 +8.06017 +8.05967 +8.08617 +8.06 +8.06017 +8.05967 +8.05967 +8.05983 +8.05983 +8.06667 +8.05983 +8.0595 +8.05967 +8.05967 +8.06017 +8.05967 +8.05967 +8.05983 +8.06 +8.0855 +8.0595 +8.0605 +8.0605 +8.06017 +8.06017 +8.05967 +8.06033 +8.06017 +8.06017 +8.0595 +8.05983 +8.06033 +8.06017 +8.06 +8.06 +8.0595 +8.0605 +8.06017 +8.06017 +8.0605 +8.06017 +8.06017 +8.06033 +8.083 +8.06017 +8.0605 +8.06067 +8.061 +8.06167 +8.06067 +8.0605 +8.05983 +8.0755 +8.06017 +8.06067 +8.06067 +8.0645 +8.06 +8.06083 +8.06083 +8.05983 +8.06017 +8.0905 +8.05983 +8.05983 +8.05933 +8.08333 +8.05983 +8.061 +8.06117 +8.06 +8.06033 +8.06117 +8.06033 +8.06783 +8.06117 +8.05983 +8.06017 +8.06 +8.07467 +8.06083 +8.08967 +8.06117 +8.06183 +8.06033 +8.06067 +8.06233 +8.06117 +8.06117 +8.061 +8.06067 +8.061 +8.06217 +8.0605 +8.06217 +8.06233 +8.062 +8.06117 +8.063 +8.0625 +8.06283 +8.06083 +8.06233 +8.06283 +8.0625 +8.0605 +8.06267 +8.06517 +8.06067 +8.06217 +8.0625 +8.0625 +8.06183 +8.06183 +8.062 +8.06417 +8.06017 +8.0625 +8.06417 +8.0605 +8.062 +8.06217 +8.06083 +8.06183 +8.06167 +8.0605 +8.06167 +8.06283 +8.06183 +8.06283 +8.06267 +8.06183 +8.063 +8.06483 +8.06217 +8.0605 +8.06433 +8.06433 +8.06233 +8.071 +8.06233 +8.06217 +8.0645 +8.06267 +8.062 +8.062 +8.06417 +8.06233 +8.061 +8.062 +8.06433 +8.06217 +8.06283 +8.06267 +8.062 
+8.06267 +8.0625 +8.0615 +8.06317 +8.062 +8.062 +8.06333 +8.06283 +8.064 +8.06283 +8.06467 +8.063 +8.0625 +8.06283 +8.06333 +8.06117 +8.06117 +8.062 +8.06417 +8.06383 +8.06483 +8.061 +8.06133 +8.06 +8.06383 +8.06483 +8.063 +8.064 +8.06333 +8.06383 +8.06267 +8.063 +8.06467 +8.06417 +8.062 +8.06267 +8.0625 +8.06483 +8.0625 +8.06467 +8.06367 +8.06283 +8.065 +8.061 +8.06467 +8.06433 +8.06383 +8.06233 +8.06383 +8.06483 +8.06217 +8.062 +8.0625 +8.064 +8.06367 +8.0615 +8.06117 +8.06283 +8.06333 +8.06217 +8.06267 +8.06333 +8.06233 +8.06283 +8.06 +8.06267 +8.06233 +8.06217 +8.06383 +8.06117 +8.061 +8.06117 +8.061 +8.0615 +8.06133 +8.074 +8.06117 +8.0615 +8.06133 +8.06117 +8.06133 +8.06117 +8.0615 +8.061 +8.06117 +8.06133 +8.06333 +8.06217 +8.061 +8.06133 +8.061 +8.06133 +8.06033 +8.061 +8.06117 +8.06183 +8.06183 +8.061 +8.061 +8.06117 +8.06117 +8.06283 +8.061 +8.061 +8.05967 +8.06117 +8.061 +8.06083 +8.06117 +8.061 +8.06083 +8.06117 +8.06083 +8.06317 +8.06067 +8.061 +8.06083 +8.06117 +8.061 +8.06083 +8.06033 +8.06067 +8.06067 +8.06567 +8.06117 +8.06117 +8.06067 +8.062 +8.06217 +8.06067 +8.061 +8.0605 +8.0605 +8.061 +8.061 +8.0605 diff --git a/data/samples/bitcoin_f.csv b/data/samples/bitcoin_f.csv new file mode 100644 index 0000000..d640e7f --- /dev/null +++ b/data/samples/bitcoin_f.csv @@ -0,0 +1,1024 @@ +16531.7031 +16537.4956 +16544.9601 +16543.6396 +16542.8817 +16547.9107 +16545.8357 +16549.4906 +16551.1993 +16563.0995 +16563.8972 +16560.4778 +16564.3615 +16565.6272 +16551.2405 +16552.5446 +16548.0945 +16556.7221 +16558.0366 +16561.6265 +16550.6351 +16546.95 +16551.4776 +16547.3148 +16547.8723 +16565.8477 +16568.3587 +16574.7878 +16572.1147 +16567.5381 +16574.5854 +16579.9037 +16582.6803 +16594.7523 +16589.8321 +16590.8261 +16592.3411 +16589.6399 +16609.2654 +16608.2903 +16603.3264 +16589.4006 +16592.5376 +16603.874 +16593.9453 +16583.9874 +16586.6018 +16584.0432 +16588.6592 +16600.2012 +16587.4287 +16584.4582 +16584.1303 +16591.418 +16609.4481 +16605.6801 +16596.2632 +16595.5975 +16581.3198 +16559.4987 +16569.6259 +16573.9412 +16588.8323 +16587.3374 +16583.4666 +16585.1392 +16609.5367 +16655.023 +16630.3277 +16634.0093 +16620.6402 +16627.8939 +16610.463 +16615.6109 +16645.2631 +16648.6134 +16652.5885 +16641.0082 +16635.9456 +16631.6188 +16718.8534 +16723.7465 +16728.7324 +16725.5828 +16738.1655 +16734.6857 +16739.1541 +16712.891 +16721.0109 +16702.9041 +16711.2895 +16718.6674 +16715.2523 +16724.344 +16715.3409 +16707.3145 +16706.4854 +16707.0304 +16722.7848 +16718.5009 +16726.1977 +16715.8675 +16710.6791 +16703.341 +16701.6864 +16701.2241 +16690.0592 +16686.6118 +16695.2296 +16683.1551 +16690.2386 +16684.305 +16696.7689 +16712.1593 +16723.4991 +16716.7431 +16722.9608 +16727.6726 +16727.7138 +16697.5325 +16696.3903 +16696.795 +16688.025 +16690.9633 +16693.9172 +16689.9984 +16698.7131 +16706.5322 +16698.9813 +16700.6716 +16705.7468 +16686.4467 +16691.9785 +16686.4138 +16698.8411 +16702.7047 +16696.3679 +16699.7152 +16706.8603 +16720.0396 +16720.6844 +16690.8246 +16691.7895 +16677.124 +16697.7702 +16690.1233 +16670.7151 +16670.403 +16681.2715 +16654.3279 +16674.8137 +16663.2431 +16663.2957 +16657.7262 +16638.0063 +16648.5753 +16663.2963 +16659.6945 +16674.0624 +16667.5661 +16659.0927 +16665.0543 +16665.3109 +16682.5057 +16679.948 +16702.8058 +16731.6253 +16723.0049 +16738.7658 +16720.51 +16706.5338 +16709.3157 +16696.4403 +16719.9721 +16743.6317 +16753.7885 +16766.2409 +16755.952 +16762.4143 +16780.8941 +16779.0381 +16785.5363 +16730.664 +16721.8249 +16720.7649 +16732.4253 +16728.4975 +16733.9898 
+16738.6627 +16724.3514 +16712.2715 +16707.3597 +16698.5876 +16688.1959 +16710.1895 +16691.3775 +16684.5184 +16687.8639 +16699.9279 +16706.2736 +16719.606 +16701.5448 +16645.2006 +16622.405 +16614.1776 +16646.436 +16671.0926 +16665.6739 +16659.4183 +16653.7846 +16619.4061 +16625.7728 +16630.6819 +16617.0537 +16628.3686 +16632.5338 +16643.2481 +16634.0472 +16624.2707 +16623.7069 +16612.5592 +16626.3589 +16636.08 +16647.9345 +16657.5063 +16652.0301 +16659.1584 +16667.6449 +16661.8133 +16669.4675 +16657.0202 +16641.9428 +16652.423 +16653.8995 +16660.8567 +16673.8415 +16661.5296 +16669.3148 +16656.1621 +16646.5265 +16705.4134 +16700.728 +16713.6283 +16717.0321 +16737.0733 +16803.6308 +16806.6033 +16802.2168 +16873.5141 +16845.0158 +16853.0979 +16859.2272 +16833.9091 +16840.9215 +16838.8047 +16834.1099 +16851.7371 +16850.6452 +16847.8787 +16854.5707 +16860.1367 +16840.5048 +16829.9918 +16850.5291 +16846.883 +16831.7656 +16802.5738 +16825.5637 +16849.598 +16835.7747 +16826.7663 +16838.8006 +16833.8942 +16833.7975 +16837.1008 +16829.2859 +16835.8243 +16834.745 +16841.1819 +16841.0214 +16839.4358 +16833.3614 +16830.1541 +16816.2532 +16817.86 +16826.3519 +16815.7444 +16815.2486 +16815.0452 +16799.8843 +16796.9124 +16823.363 +16815.3983 +16815.4027 +16838.6388 +16834.6683 +16848.412 +16867.78 +16856.3162 +16859.8092 +16841.9394 +16854.0306 +16911.1728 +16898.7312 +16885.0761 +16920.0634 +16935.2408 +16953.7357 +16910.3772 +16895.2715 +16806.5081 +16832.5323 +16826.0844 +16831.5133 +16825.6653 +16793.9759 +16798.6513 +16810.108 +16820.265 +16817.3134 +16826.6499 +16816.2855 +16817.2282 +16824.3104 +168249475888010.0 +166884563179570.0 +16858.2362 +16842.3425 +16863.4678 +16829.3507 +16833.9685 +16836.7671 +16839.638 +16842.3612 +16833.289 +16826.1284 +16829.2482 +16828.1071 +16827.5167 +16834.2835 +16831.1525 +16835.4222 +16833.6545 +16846.1135 +16838.3096 +16838.8608 +16829.0237 +16828.0441 +16812.2771 +16810.1928 +16812.8664 +16813.221 +16807.441 +16812.8101 +16818.9274 +16807.661 +16799.6045 +16798.976 +16789.718 +16797.4385 +16793.5193 +16779.8417 +16806.0945 +16817.1611 +16820.2889 +16821.0507 +16820.9282 +16821.1614 +16821.513 +16825.6572 +16830.3126 +16815.924 +16823.7284 +16822.2874 +16823.7366 +16820.5266 +16818.7499 +16808.4625 +16804.7094 +16822.9986 +16819.1781 +16800.7752 +16853.7275 +16848.4081 +16862.719 +16868.7746 +16842.7882 +16832.2705 +16833.7704 +16833.8377 +16827.22 +16818.2604 +16816.9111 +16805.6753 +16795.9179 +16815.4328 +16825.4911 +16827.123 +16828.491 +16830.8596 +16828.1135 +16841.692 +16864.0672 +16852.1071 +16857.0448 +16858.2814 +16858.1368 +16852.9118 +16847.333 +16842.7408 +16847.8651 +16846.0226 +16833.528 +16835.7092 +16822.396 +16823.2505 +16827.6591 +16820.1337 +16857.5993 +16841.8658 +16812.9343 +16821.8162 +16827.8146 +16817.8535 +16826.7053 +16824.1109 +16827.7687 +16827.1475 +16808.7813 +16805.7622 +16803.3504 +16816.4773 +16812.971 +16809.7448 +16805.4557 +16807.529 +16798.1731 +16781.7656 +16788.3204 +16774.7052 +16778.7105 +16787.2804 +16780.7142 +16778.8976 +16792.2987 +16795.9494 +16790.9632 +16788.4096 +16764.2291 +16772.991 +16774.8177 +16783.5984 +16783.4555 +16793.1841 +16785.4385 +16758.7166 +16736.8678 +16724.9964 +16742.2074 +16729.2172 +16725.8529 +16733.1399 +16731.1195 +16702.3044 +16723.6935 +16738.2825 +16756.7058 +16741.8471 +16723.2703 +16746.1834 +16712.0568 +16759.19 +16751.27 +16749.1565 +16786.7341 +16785.1331 +16756.1971 +16756.9742 +16758.0969 +16817.9121 +16806.0716 +16810.7584 +16801.1799 +16824.8727 +16833.8756 +16845.0069 
+16835.9829 +16847.0159 +16846.4522 +16858.8148 +16925.4964 +16947.9107 +16972.6507 +16943.3619 +16922.9194 +16901.3975 +16918.8823 +16931.4765 +16922.19 +16934.0012 +16941.7132 +16954.5037 +16944.4464 +16955.4 +16972.2612 +16961.6378 +16957.2926 +16948.3176 +16960.4949 +16954.8025 +16945.3961 +16939.7816 +16939.0907 +16940.3604 +16957.336 +16955.9168 +16945.0734 +16950.0435 +16947.1339 +16943.9462 +16944.311 +16938.7936 +16938.2793 +16938.2006 +16939.1065 +16927.5889 +16929.6986 +16932.3924 +16934.4844 +16932.5042 +16935.5038 +16942.7204 +16947.1802 +16952.5976 +16930.5797 +16935.2637 +16938.4413 +16928.7015 +16927.5671 +16927.7995 +16928.6649 +16929.7567 +16926.3974 +16922.7412 +16922.7001 +16914.9337 +16920.3698 +16922.4497 +16911.4116 +16911.7448 +16912.278 +16919.3804 +16917.4007 +16924.2481 +16931.2469 +16930.3484 +16918.2556 +16916.5603 +16926.4578 +16936.7419 +16936.5303 +16941.3961 +16937.9761 +16935.068 +16936.0109 +16931.796 +16923.4442 +16918.3241 +16918.2665 +16925.5835 +16927.2614 +16928.946 +16929.9396 +16935.2573 +16934.8526 +16936.0953 +16937.8251 +16939.3966 +16936.7512 +16942.8706 +16947.5484 +16940.5759 +16937.6494 +16941.0772 +16945.4936 +16941.97 +16945.4746 +16943.0505 +16940.149 +16935.0435 +16937.2194 +16938.5223 +16939.7859 +16942.1364 +16943.6187 +16944.1693 +16926.1968 +16928.9346 +16932.4748 +16923.8423 +16926.9296 +16928.2702 +16930.3726 +16932.3247 +16936.9025 +16943.1473 +16937.6443 +16937.9359 +16935.359 +16935.4332 +16939.1919 +16937.8732 +16942.6759 +16941.4562 +16938.8909 +16939.6547 +16938.0337 +16948.4477 +16946.2829 +16953.0611 +16947.9855 +16954.1216 +16950.3282 +16947.7591 +16943.842 +16948.6631 +16951.3531 +16962.2798 +16939.0165 +16938.6551 +16934.2459 +16937.5293 +16936.3051 +16938.1817 +16937.2163 +16939.4192 +16939.6151 +16928.5737 +16928.9116 +16927.1545 +16925.8103 +16925.9805 +16935.0158 +16933.7198 +16929.4307 +16936.5007 +16938.9503 +16941.4173 +16949.5405 +16942.4886 +16945.0295 +16945.2058 +16961.1693 +16977.5838 +16965.6695 +16922.8471 +16949.6015 +16936.2933 +16949.3306 +16943.2917 +16933.8434 +16925.7407 +16937.0887 +16936.5277 +16931.494 +16927.384 +16933.9634 +16930.9489 +16936.6004 +16923.4743 +16949.6055 +16951.0284 +16964.9692 +16958.5941 +16963.9283 +16968.2083 +16969.5971 +16958.298 +16954.7393 +16964.1426 +16963.7628 +16971.0548 +17044.0609 +17031.6312 +17049.5259 +17112.4985 +17154.0707 +17156.5116 +17169.8537 +17207.8971 +17191.9356 +17179.1824 +17193.2738 +17197.0665 +17203.3032 +17205.8857 +17209.1392 +17238.2895 +17229.5983 +17216.3968 +17237.0856 +17221.31 +17199.2811 +17202.3601 +17198.0343 +17206.2653 +17190.5573 +17181.8852 +17185.3721 +17202.5844 +17195.567 +17193.1789 +17197.629 +17199.1195 +17207.1477 +17182.638 +17198.1982 +17234.6414 +17236.7034 +17246.5481 +17275.3093 +17273.613 +17273.6892 +17270.8854 +17257.2807 +17262.5747 +17253.3112 +17227.7782 +17233.3449 +17233.037 +17237.2771 +17238.9288 +17243.0313 +17242.0026 +17203.0006 +17201.2334 +17225.9998 +17205.1559 +17226.2168 +17265.4966 +17226.2177 +17246.2659 +17291.8318 +17267.056 +17288.5898 +17307.5898 +17322.5183 +17313.6939 +17372.1263 +17334.9088 +17326.9988 +17367.1842 +17394.3246 +17385.9619 +17364.266 +17344.1075 +17345.0729 +17334.715 +17328.8645 +17276.1155 +17251.1656 +17226.0986 +17210.6535 +17215.9603 +17229.674 +17221.9516 +17188.6979 +17179.1714 +17194.1487 +17216.3647 +17213.4772 +17195.1152 +17176.8839 +17175.4206 +17182.5164 +17189.4086 +17215.7438 +17191.6868 +17202.4101 +17167.0285 +17191.8344 +17188.305 +17198.9345 +17194.1138 
+17198.1116 +17203.5826 +17205.5741 +17199.511 +17213.1072 +17217.1236 +17214.8956 +17204.51 +17214.4618 +17215.2554 +17215.7775 +17200.6721 +17192.6754 +17204.5412 +17207.6754 +17203.3643 +17205.6272 +17194.5324 +17234.2837 +17246.2967 +17245.5027 +17254.2346 +17264.8796 +17241.697 +17252.8483 +17245.6111 +17248.55 +17270.6498 +17263.137 +17273.1137 +17275.3784 +17270.427 +17264.5154 +17256.2323 +17240.4457 +17242.8107 +17230.5981 +17233.439 +17245.5213 +17234.034 +17248.8077 +17260.967 +17245.0135 +17270.9354 +17220.2222 +17275.1909 +17320.4174 +17274.8736 +17291.5171 +17333.5051 +17329.2798 +17295.2616 +17305.0948 +17291.5811 +17302.1937 +17304.4391 +17324.9514 +17332.9818 +17396.8589 +17420.9577 +17413.9619 +17414.7855 +17416.7368 +17415.9996 +17434.1793 +17417.5212 +17463.5132 +17444.2155 +17461.8473 +17466.5929 +17463.6307 +17469.8737 +17468.5938 +17451.944 +17450.3253 +17447.5827 +17438.2843 +17423.6061 +17425.3101 +17431.1224 +17435.7808 +17449.4961 +17479.9717 +17458.3626 +17446.9558 +17423.0222 +17416.8011 +17406.5783 +17370.5713 +17404.0214 +17412.2822 +17406.6616 +17411.4106 +17399.2315 +17410.5833 +17399.2091 +17412.8518 +17413.7004 +17424.776 +17396.7229 +17417.9672 +17426.5002 +17421.4426 +17435.3837 +17440.106 +17434.8171 +17443.9719 +17445.5184 +17433.0089 +17434.7095 +17431.2853 +17463.1218 +17454.4649 +17440.7982 +17441.819 +17435.5299 +17443.1972 +17455.8406 +17438.5473 +17435.5692 +17437.8613 +17457.5609 +17435.5947 +17429.5377 +17419.5955 +17422.9699 +17420.5937 +17424.2445 +17419.5279 +17400.934 +17390.5697 +17406.6773 +17392.4852 +17406.3652 +17417.1614 +17413.9774 +17409.3211 +17359.8143 +17334.1842 +17346.0101 +17348.3062 +17359.9184 +17357.146 +17364.1046 +17382.556 +17380.1912 +17380.0171 +17408.0359 +17479.7428 +17479.9341 +17483.0834 +17473.7745 +17515.8855 +17545.6838 +17524.8983 +17537.019 +17534.1287 +17527.9671 +17532.4499 +17564.6715 +17535.9694 +17552.7909 +17557.527 +17570.5057 +17532.9799 +17556.7019 +17570.8662 +17566.0647 +17558.8231 +17868.2198 +17915.7346 +17909.9012 +17932.9659 +18178.2654 +18263.8887 +18244.4011 +18231.3607 +18209.0394 +18201.5374 +18213.4919 +18226.6047 +18213.2347 +18213.4285 +18211.2447 +18217.394 +18212.8521 +18209.8649 +18179.648 +18146.5335 +18102.3259 +18126.0236 +18119.4639 +18152.2141 +18147.9037 +18161.4794 +18159.1848 +18121.4106 +18115.6885 +18120.6383 +18124.3788 +18126.9796 +18137.812 +18142.0949 +18143.8056 +18141.6232 +18140.1305 +18203.6953 +18182.4521 +18183.6501 +18180.3805 +18185.2614 +18184.0594 +18190.796 +18199.5358 +18197.3101 +18221.079 +18260.37 +18218.7669 +18089.4891 +18207.9422 +18279.3643 +18139.6392 +18080.1539 +18035.3858 +18067.8334 +18060.6613 +18094.7547 +18075.5599 +18054.5245 +18111.0509 +18111.4377 +18139.5547 +18208.4581 +18761.1983 +18763.505 +18783.8879 +18715.3852 +18759.6473 +18774.7676 +18851.6759 diff --git a/data/samples/bitcoin_transactions_f.csv b/data/samples/bitcoin_transactions_f.csv new file mode 100644 index 0000000..39902d0 --- /dev/null +++ b/data/samples/bitcoin_transactions_f.csv @@ -0,0 +1,1024 @@ +124.854 +42.9561 +49783.0938 +35374.375 +2218.8999 +13.9804 +14983.0332 +464119.0938 +115.847 +117.7646 +179.45 +60.3492 +58.673 +179.9612 +24.162 +360.123 +88.5039 +25.9549 +133.8232 +274.1797 +155.323 +9497.9814 +166.7534 +663.523 +151.7781 +27.3559 +160.201 +74.0225 +146.774 +269.7193 +269.0989 +22.0159 +155.4561 +23.3588 +3775.665 +3775.2214 +3774.7776 +3774.3337 +3773.8901 +3773.4463 +3773.0024 +3772.1792 +3771.356 +3770.5327 +995.6462 +88.8918 +20.7942 +118.3597 +62.4181 
+2815.1677 +719.1459 +324.9144 +500.5838 +43.0511 +117.5493 +18.151 +5873.7393 +84.9053 +48.9915 +929.6197 +89.766 +49.801 +1011.8765 +10001.1367 +1048.4244 +2924.9292 +7218.6343 +218.7281 +9111.1768 +6235.1362 +5146.7065 +4396.2559 +2765.656 +399.347 +294.8017 +1798.1957 +4710.8594 +26983.1973 +15575.1533 +35736.4375 +90892.0938 +35349.5234 +13786.0684 +54957.6641 +54898.3281 +2669.584 +12813.9736 +90842.6484 +12740.6484 +169661.0938 +743.4544 +36.52 +12126.998 +4228.4966 +909.5333 +210.8665 +11378.0576 +297.9277 +199.1867 +249.7496 +14280.7959 +14280.6357 +885.5057 +1444.5505 +188410.6875 +126.3149 +106.6958 +30.3088 +448.4326 +394.0261 +201.047 +18134.5625 +97.1097 +130891.6484 +274.4784 +2002.5168 +777.2323 +5914.2314 +133.9035 +24386.7129 +46.7389 +28999.416 +2008.0969 +176.596 +2614.1575 +6032.4136 +5618.0483 +892.4868 +13975.4736 +637.486 +75323.3906 +101.7215 +101.7778 +673.7308 +55.5608 +10.096 +640.8458 +221.5949 +42.673 +221.89 +3098.0935 +210.8199 +40.4949 +2799.4331 +2681.3943 +258.5764 +84.6231 +22.9527 +9.6083 +47.9282 +1331.0525 +30322.5176 +13.6174 +411.7817 +179.9306 +7.6117 +30290.9082 +3.7109 +341.4075 +404.3426 +203.8082 +4.6845 +19.7921 +28.1876 +34.8079 +16.6129 +336512.0625 +112104.2031 +9489.2705 +54.249 +30.9763 +493.4097 +44.5524 +149.7758 +100.4895 +1029.5696 +984.3409 +903.647 +60.4721 +947.6882 +49.2915 +38.9532 +32.5801 +48.9267 +46.699 +46.2552 +45.8114 +45.3676 +10.2069 +92.4585 +1559.3737 +2472.7017 +705.1078 +0.6164 +1.0025 +165.0879 +9.5608 +9.117 +8.6732 +20.3251 +3.9403 +9.7632 +5.5761 +5.1323 +6871.3545 +979.5654 +949.2224 +40568.5312 +32654.3691 +21.4204 +23.9632 +6556.145 +286.7888 +678.3151 +53620.3711 +39354.0078 +5315.3516 +4270.1475 +784.6124 +16.9795 +16.3134 +226.1636 +90937.1016 +33683.6836 +75.2726 +559.4166 +235.7448 +44616.8438 +22.5755 +23.9641 +5538.3418 +300.0508 +32.1363 +31.6925 +31.2488 +30.805 +30.3612 +47.3922 +9.9882 +426.205 +341.7106 +20.03 +3.1065 +192.7115 +13.4798 +18.5833 +18.4346 +18.286 +17.8102 +123.018 +89.3294 +83.7005 +260.9608 +922.7389 +6.95 +6.5329 +4.9943 +148.0827 +2528.5857 +498.9063 +3.2995 +244.7429 +26207.752 +540.9935 +166.4424 +46289.5039 +49.9253 +300.5531 +226.1707 +35.9302 +281.0388 +25.2626 +23.0961 +6.6669 +50.4019 +66.2515 +70.3853 +83.262 +1.6464 +417.0019 +71.4938 +3.0008 +47.6251 +28.0114 +2196.126 +14.9776 +2607.6748 +4.9814 +67.3485 +451.3877 +2.2189 +1.2026 +1.1094 +2.2189 +18.1297 +17635.6855 +17369.0664 +314.0209 +299.496 +210.7023 +23.6526 +125.9124 +249.214 +200.056 +1352.9525 +2.2189 +381.0983 +76.3994 +261.5519 +286.4578 +18.9951 +102.7315 +101.5338 +24.4399 +34.3934 +416.1032 +83.1755 +18.088 +3.5844 +294.886 +1032.0112 +19.9816 +2.9951 +37916.25 +1.464 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +85.8794 +2.5295 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +21.9449 +6959.9873 +957.1664 +102.8118 +84.8312 +133.0177 +39.6406 +4.2971 +146.0462 +0.5893 +204.6935 +84.1638 +44595.6992 +27036.1133 +28.7525 +2059.969 +75342.3828 
+45182.8008 +16.762 +146.9759 +83.9521 +51.3702 +67.3591 +317.8658 +258.4903 +3.8023 +15.2727 +214.644 +808.2263 +169.768 +1960.6577 +78.4869 +12.7804 +108.9356 +95.2995 +210.2519 +2534.5598 +197.037 +99.9029 +96.8741 +165.4092 +7.4693 +25.3012 +12.022 +25.6718 +10.3538 +96.8741 +78.9103 +104.741 +2490.3801 +71.1468 +48.1364 +180.0273 +17.3926 +239.1526 +1636.8741 +5.3236 +1.8492 +14.4237 +10.3134 +4264.9336 +146.715 +1.1099 +144.7251 +66.5839 +47.1902 +212.6625 +4635.4941 +1517.5958 +170.3858 +48.4497 +212.1987 +4851.877 +4441.4546 +135.1874 +1771.2582 +2.2185 +119.4545 +4.7107 +80.9437 +210.7387 +5719.6401 +1.6313 +182.4362 +798.1547 +129.5651 +1305.6292 +1398.4871 +97.8615 +67.2868 +44.2835 +1116.9552 +3701.6284 +83.4608 +77.4529 +317.7851 +3367.0525 +47.8044 +563.428 +579.9454 +119.9915 +1006.0674 +105.0081 +60.1744 +4.3979 +753.8256 +449.7533 +322.3241 +456.9891 +423.2951 +2.8389 +1011.5681 +99.6566 +15.2265 +72.5318 +9498.6309 +528.2846 +1597.3315 +156.874 +131.7494 +7780.9048 +60.8001 +509.897 +173.9196 +150.1902 +468.9556 +136.2045 +989.1439 +986.5003 +5.1687 +92.2916 +1450.5046 +3681.2061 +329.6007 +199.2035 +110.531 +68.7859 +2091.6558 +1082.5485 +94.9787 +455.1221 +2111.0718 +14.5116 +90.1459 +302.9034 +120.2821 +16.4434 +3487.2815 +56.4062 +80.728 +104.0061 +68.1939 +2273.2883 +2040.9242 +1832.6255 +1588.5677 +1268.2069 +3130.8337 +66.3766 +267.6628 +14.1304 +829.4634 +167.4755 +22068.377 +2907.5244 +68.6838 +1567.8726 +2268.9299 +2228.7759 +2188.6218 +2148.4678 +2108.3135 +2068.1594 +2028.0027 +3655.3994 +199.8683 +1008.6556 +73.352 +49.9057 +525.1368 +399.2906 +22.7011 +20.8652 +536.7079 +169.8905 +27.3577 +290.2419 +208.1337 +1987.8486 +1947.6945 +126.0482 +824.2766 +17.208 +458.9608 +720.6606 +80.0384 +1006.9359 +2702.3606 +2642.239 +2622.0728 +886.7324 +251.5021 +183.7054 +133.5494 +2607.1165 +148.909 +24.4221 +1016.376 +37.3862 +280.1073 +1994.9673 +799.949 +1165.4692 +209.9678 +1827.5038 +461.0577 +214.4576 +157.0018 +360.1204 +43.095 +475.7979 +1332.3195 +284.5562 +1356.726 +170.9223 +303.1741 +210.9251 +1951.3322 +459.1969 +119.1935 +72.487 +128.4042 +54.8778 +293.8361 +440.3239 +149.7598 +296.7885 +263.8134 +13.5757 +81.7403 +32.2468 +286.5341 +99.8691 +49.7136 +207.9473 +1.7747 +120.5879 +47.366 +31.3189 +44377.8125 +886.6387 +268.21 +6582.0059 +106.3337 +2059.489 +7521.9258 +261.5187 +1812.8604 +23.7147 +38.1726 +32.3436 +38.0222 +299.8306 +49.7375 +495.4981 +32.5517 +2012.6542 +2031.8857 +3019.7322 +4999.3354 +116.415 +17934.3828 +808.2765 +79.8662 +208.2904 +344.1638 +30.1975 +1.9966 +50.3965 +1.7751 +7154.5796 +1157.9818 +487.0889 +294.3273 +91.2611 +604.834 +396.1704 +1926.465 +2.9587 +87.8596 +80.5616 +2830.668 +106.8072 +2525.1841 +4002.592 +1260.2074 +13807.6924 +449.5536 +408.5155 +10183.084 +981.3063 +503.5558 +1137.0479 +1.2137 +681.7397 +5.3156 +501.1803 +1000.9161 +124.2664 +614.0415 +102.2145 +1659.7798 +324.6082 +209.0328 +33.8635 +79.7845 +83933.8359 +251.6707 +2.222 +29.2637 +94.4874 +208.7732 +1006.8401 +419.3433 +46.6364 +971.8258 +122.3182 +1065.3228 +586.4233 +402.6274 +4.1635 +3.3696 +191.6499 +1758.1073 +50.5763 +396.2503 +61.2572 +1340.9514 +10.1528 +24.7012 +30.6829 +14.3767 +334.71 +3.1078 +120.5706 +165.9502 +2169.4944 +1493.6685 +687.0193 +33.41 +49.3031 +234.5222 +132.5473 +201.0341 +125.0417 +652.575 +159.72 +161.2173 +17.887 +1697.2783 +27.9808 +26089.2598 +24.3347 +22.6971 +30.1446 +116.984 +297.3703 +212.414 +398.9493 +20.231 +86.0694 +217.6368 +330.834 +592.7574 +238.4545 +280.7694 +18.4985 +322.6462 +441.0401 +485.502 +243.8802 
+1113.452 +202.2186 +782.8421 +456.1956 +109.4934 +1920.6665 +18.7497 +114.5645 +197.5859 +544.6676 +133.1704 +5.0081 +1914.083 +105.2202 +4576.7944 +72.2842 +2.7164 +1.0677 +1.0828 +4755.6328 +2784.7986 +343.4542 +103.5374 +77.5883 +135.4164 +299.8031 +835.3524 +215.7126 +2021.9722 +928.5169 +88.4223 +275.5829 +2387.2856 +55.5364 +196.0243 +720.4959 +47.1352 +97.3423 +269.3217 +309.9794 +6.4384 +111.8565 +400.8869 +51.4421 +118.211 +1.4951 +46.446 +1178.3921 +1639.1769 +51.2104 +441.7581 +179.2001 +113.0521 +73.1962 +32.2211 +20.3664 +215.3101 +12.9233 +10.7683 +12.3539 +17.2258 +1084.2655 +13.5894 +4585.3682 +10094.0508 +199.7924 +397.3593 +360.1665 +42.9379 +1019.0627 +756.4354 +28.2852 +179719.0156 +3328.6096 +6.5253 +300.1866 +399.2982 +181.0516 +47.5546 +731.9712 +81.2699 +49.8507 +15.7409 +163.4371 +1880.7281 +209.0 +478.7729 +247.315 +272.5222 +64.5789 +192.1656 +127.581 +106.3621 +99.8372 +67.2651 +1314.5913 +1.1094 +2407.3489 +1291.9723 +3698.7031 +1205.6487 +1.1094 +24.8277 +111.7114 +13183.9941 +49.868 +221.6073 +695.3771 +301.6213 +106.9745 +2958.7981 +745.9906 +73.4207 +500.7955 +450.6391 +25.4747 +727.6501 +839.0162 +1244.147 +264.3198 +365.4253 +547.3929 +2666.312 +157.2907 +132.1568 +130.0249 +44.54 +76.9306 +83348.1016 +148.8651 +1598.8573 +80.1351 +84.7269 +7372.7593 +28.0145 +214.8605 +485.7234 +244.367 +244.261 +996.3655 +506.384 +194.3202 +7102.4956 +534.4216 +110.002 +74.7405 +297.2904 +422.5549 +8554.7832 +90.8222 +566.0427 +50.2865 +39.4463 +315.3438 +1319.1494 +178.5913 +698.9366 +1067.8634 +1658.9562 +1913.46 +29.3978 +791.8868 +229.8483 +404.359 +101.3079 +38.3954 +1898.4988 +317.2734 +143.352 +29.7306 +4.4724 +194.9703 +362.6734 +40.6951 +620.5242 +863.2617 +1252.0956 +828.3748 +422.112 +27.9892 +255.1868 +49.9355 +90.6017 +126.1125 +689.0989 +909.7899 +899.6539 +33.583 +604.4133 +99.8372 +28.6859 +31581.6465 +1.8648 +64.6179 +73.4966 +10.3427 +141.9386 +111.3719 +9242.6836 +788.5847 +117.8369 +488.0116 +100.9653 +178.2269 +19.9515 +362.6521 +83.9365 +543.35 +5.032 +478.1175 +191.5914 +580.5898 +251.5301 +564.5814 +75.3725 +67.238 +98.876 +421.4445 +154.8974 +36.1836 +57.3488 +469.2481 +1387.0432 +52.3407 +1346.369 +86.9818 +76.8361 +453.6062 +303.6653 +130.0812 +400.8483 +370.7489 +52.1623 +100.0205 +79.5489 +1956.1392 +400.0561 +251.7307 +14.1371 +3883.5925 +66.6531 +80.0233 +97.9973 diff --git a/data/samples/city_temperature_f.csv b/data/samples/city_temperature_f.csv new file mode 100644 index 0000000..5b7c59b --- /dev/null +++ b/data/samples/city_temperature_f.csv @@ -0,0 +1,1024 @@ +64.2 +49.4 +48.8 +46.4 +47.9 +48.7 +48.9 +49.1 +49.0 +51.9 +51.7 +51.3 +47.0 +46.9 +47.5 +45.9 +44.5 +50.7 +54.0 +52.6 +54.2 +51.0 +53.5 +54.2 +54.2 +52.6 +55.5 +53.8 +54.3 +57.4 +56.9 +50.4 +50.1 +54.1 +49.1 +48.8 +50.7 +51.6 +52.6 +56.3 +59.0 +59.4 +55.5 +57.0 +60.8 +61.8 +57.7 +56.1 +53.4 +51.4 +52.6 +52.5 +57.5 +55.1 +54.3 +63.0 +60.0 +48.3 +55.3 +52.2 +56.6 +54.7 +51.9 +54.5 +58.5 +53.4 +51.8 +53.3 +65.6 +68.7 +58.4 +55.1 +52.8 +53.9 +54.8 +55.0 +52.8 +56.1 +56.5 +56.7 +51.4 +51.6 +53.3 +56.4 +54.7 +54.5 +53.4 +56.6 +53.2 +46.6 +47.4 +52.0 +62.2 +64.2 +59.5 +59.0 +54.9 +54.2 +57.8 +60.0 +61.1 +56.2 +56.1 +54.6 +54.5 +52.0 +56.6 +60.4 +62.7 +61.0 +56.5 +56.0 +53.1 +51.1 +57.2 +56.3 +56.5 +60.8 +60.4 +61.5 +60.4 +59.7 +61.5 +63.9 +63.7 +66.7 +75.1 +77.2 +74.6 +69.4 +67.4 +68.2 +63.3 +60.4 +65.6 +68.9 +68.4 +70.1 +70.3 +68.0 +62.6 +64.3 +72.0 +73.9 +68.2 +67.7 +67.2 +67.7 +68.3 +68.2 +66.8 +64.6 +66.7 +67.7 +68.6 +69.3 +70.4 +72.6 +70.6 +70.5 +69.1 +68.6 +70.3 +69.2 +69.2 
+71.5 +71.4 +71.9 +70.0 +76.5 +73.4 +75.9 +74.5 +73.8 +72.8 +74.6 +75.8 +75.8 +75.1 +73.6 +77.7 +78.6 +77.4 +76.2 +73.0 +73.9 +72.9 +74.1 +76.6 +75.9 +76.7 +76.1 +75.5 +74.6 +76.7 +76.5 +76.4 +76.6 +77.7 +78.3 +76.9 +83.4 +81.6 +77.2 +80.4 +79.3 +81.1 +80.7 +80.3 +79.4 +78.8 +79.7 +80.1 +79.9 +77.6 +78.6 +78.5 +78.3 +78.4 +78.9 +-99.0 +-99.0 +85.7 +82.1 +79.5 +81.0 +78.9 +77.6 +79.4 +79.8 +79.0 +77.5 +78.3 +76.2 +77.4 +76.0 +74.9 +73.8 +75.6 +75.8 +76.0 +73.0 +72.9 +76.3 +79.6 +84.2 +76.0 +76.0 +76.0 +75.9 +75.7 +70.6 +73.6 +75.1 +75.2 +72.5 +69.4 +69.9 +71.0 +72.0 +74.0 +71.9 +67.9 +67.8 +69.3 +68.6 +66.0 +65.3 +62.6 +65.2 +66.7 +67.0 +70.6 +69.0 +69.0 +74.0 +72.9 +73.3 +73.6 +71.1 +72.4 +70.6 +69.8 +67.8 +66.0 +71.6 +68.9 +69.1 +66.0 +67.3 +69.5 +65.4 +63.0 +62.2 +63.2 +63.4 +62.4 +67.4 +66.4 +66.9 +68.9 +70.3 +71.6 +71.3 +67.9 +69.9 +65.3 +60.6 +61.6 +55.0 +52.5 +57.2 +58.3 +74.0 +74.8 +68.6 +59.9 +58.4 +59.5 +64.3 +66.8 +71.7 +65.7 +59.0 +63.5 +60.5 +56.4 +56.4 +60.5 +59.3 +56.8 +52.6 +56.6 +57.5 +57.3 +55.9 +55.1 +57.7 +53.8 +51.8 +59.3 +58.2 +53.2 +52.6 +52.9 +52.1 +54.4 +52.3 +62.7 +62.2 +55.6 +54.1 +52.1 +58.1 +57.0 +55.3 +60.8 +67.0 +66.9 +65.7 +64.2 +63.9 +57.3 +64.3 +66.1 +67.4 +60.0 +54.4 +57.7 +57.6 +62.2 +59.8 +56.8 +64.5 +56.7 +53.5 +54.3 +54.1 +53.3 +59.4 +53.2 +56.8 +55.3 +53.3 +55.8 +58.6 +66.0 +61.1 +56.8 +51.9 +52.1 +58.6 +62.8 +56.3 +54.2 +54.0 +60.6 +58.3 +54.7 +56.0 +51.8 +55.8 +55.9 +55.8 +48.9 +54.6 +51.0 +51.4 +-99.0 +-99.0 +49.4 +48.4 +51.2 +48.5 +54.4 +48.5 +47.0 +42.3 +45.9 +45.9 +54.4 +52.5 +49.0 +50.2 +51.7 +48.6 +49.5 +51.6 +50.3 +50.7 +57.7 +56.5 +55.3 +59.8 +55.4 +53.3 +51.4 +53.8 +52.4 +50.9 +51.0 +49.4 +50.1 +50.4 +56.1 +57.2 +60.4 +65.5 +72.4 +64.2 +61.5 +60.2 +60.3 +58.9 +63.6 +62.4 +60.5 +61.6 +55.4 +50.6 +53.2 +55.1 +55.0 +56.9 +56.3 +55.6 +59.5 +59.1 +61.0 +61.2 +57.9 +58.9 +61.3 +54.0 +60.4 +74.1 +68.0 +68.6 +64.7 +59.0 +58.2 +59.4 +58.8 +57.7 +58.6 +60.5 +60.7 +62.6 +63.3 +63.2 +62.5 +66.4 +63.1 +64.9 +64.6 +62.2 +61.8 +61.1 +60.5 +61.0 +63.6 +64.4 +69.0 +67.0 +64.7 +61.8 +60.6 +61.0 +62.4 +64.6 +66.2 +66.3 +67.0 +67.5 +66.3 +66.0 +66.1 +68.0 +67.1 +65.0 +63.8 +65.8 +67.6 +68.0 +71.6 +71.6 +72.0 +72.3 +73.0 +72.7 +72.6 +71.6 +72.7 +74.1 +72.6 +74.0 +78.9 +81.4 +74.4 +69.6 +65.2 +64.7 +67.0 +67.7 +68.8 +72.2 +71.5 +72.8 +77.4 +73.7 +74.4 +78.5 +78.8 +79.7 +74.3 +69.3 +71.2 +71.6 +71.0 +69.5 +70.6 +72.0 +76.0 +74.0 +73.6 +75.7 +77.1 +76.0 +75.6 +82.5 +87.9 +88.1 +81.2 +78.0 +75.1 +77.7 +79.2 +78.6 +76.0 +76.8 +77.9 +79.0 +77.9 +79.0 +81.6 +80.5 +77.8 +77.7 +81.1 +77.8 +77.3 +79.2 +80.2 +79.5 +77.3 +73.6 +75.2 +78.1 +77.6 +74.0 +76.2 +75.6 +77.7 +86.4 +82.4 +78.4 +76.1 +75.4 +75.0 +72.4 +71.0 +68.0 +70.1 +71.9 +73.9 +76.2 +77.1 +73.0 +68.9 +76.4 +75.4 +72.3 +69.4 +69.5 +69.7 +69.8 +72.5 +72.1 +68.2 +72.8 +68.8 +67.5 +65.9 +69.1 +69.6 +65.7 +66.4 +66.4 +65.5 +67.4 +67.9 +63.4 +63.1 +60.8 +57.3 +57.9 +61.7 +61.9 +61.2 +62.2 +62.0 +64.4 +67.4 +64.5 +62.3 +61.9 +65.5 +63.7 +63.7 +60.2 +60.7 +61.1 +61.8 +61.4 +60.1 +61.1 +61.3 +60.1 +63.8 +57.0 +56.9 +55.3 +55.1 +56.5 +58.8 +59.9 +54.7 +55.6 +56.3 +57.1 +65.7 +76.9 +72.1 +61.0 +60.6 +59.1 +57.9 +56.8 +54.6 +62.5 +61.8 +61.4 +57.8 +57.1 +56.1 +62.7 +63.4 +59.8 +52.9 +51.1 +59.7 +56.0 +52.4 +52.1 +60.4 +52.1 +50.5 +48.1 +48.9 +52.7 +51.2 +53.7 +57.1 +64.1 +59.5 +57.2 +54.3 +56.6 +59.8 +58.4 +66.9 +71.1 +58.9 +59.3 +58.7 +53.2 +49.2 +43.9 +55.2 +51.6 +56.6 +57.7 +59.0 +57.7 +53.7 +49.3 +52.8 +48.6 +50.0 +58.2 +55.3 +50.2 +48.2 +49.5 +54.6 +58.7 +60.1 +52.8 +53.9 +57.2 +67.3 +67.9 +65.0 +62.6 +54.3 +53.7 
+52.7 +53.8 +50.1 +59.7 +57.7 +55.0 +55.4 +56.2 +54.0 +54.0 +52.7 +49.8 +49.5 +49.6 +51.0 +51.4 +51.0 +52.4 +56.0 +59.0 +57.4 +52.5 +51.4 +53.9 +50.7 +50.8 +51.9 +53.4 +58.3 +56.3 +54.6 +55.8 +54.2 +49.9 +50.2 +52.0 +52.3 +52.0 +53.6 +50.7 +55.6 +53.4 +52.9 +54.0 +49.9 +54.8 +50.6 +52.2 +52.3 +57.0 +53.5 +54.0 +56.1 +54.6 +54.2 +53.6 +53.0 +54.3 +54.5 +54.9 +54.6 +58.1 +55.1 +52.3 +52.9 +52.3 +54.0 +53.9 +57.2 +56.6 +56.1 +62.3 +63.1 +62.1 +60.7 +60.3 +55.4 +58.0 +62.7 +60.1 +62.0 +59.6 +59.0 +61.5 +68.9 +58.1 +58.1 +57.8 +59.1 +59.0 +59.4 +66.9 +67.7 +67.5 +61.8 +59.4 +60.6 +63.4 +65.4 +69.7 +70.9 +63.0 +59.3 +60.4 +64.7 +65.8 +65.8 +68.0 +66.2 +65.2 +65.5 +63.6 +67.1 +68.8 +71.2 +66.3 +67.2 +71.1 +76.4 +82.7 +72.9 +70.8 +72.2 +70.2 +72.6 +70.0 +70.7 +71.1 +69.6 +74.2 +70.9 +73.8 +75.1 +75.3 +75.0 +77.4 +75.4 +74.0 +76.1 +77.2 +76.1 +75.0 +77.8 +73.6 +72.7 +74.8 +77.9 +75.5 +72.4 +73.8 +73.3 +73.7 +75.8 +71.9 +70.2 +68.0 +71.7 +73.7 +73.0 +71.6 +71.8 +73.4 +73.2 +71.6 +71.9 +72.8 +73.9 +74.7 +80.6 +79.1 +77.5 +77.4 +77.0 +77.1 +77.4 +75.5 +75.8 +77.9 +76.6 +74.3 +75.1 +75.7 +76.8 +76.1 +75.0 +76.2 +77.4 +77.4 +77.1 +80.2 +79.7 +78.0 +76.7 +81.4 +78.5 +84.3 +86.8 +80.2 +75.2 +76.0 +75.9 +78.3 +78.5 +78.6 +77.5 +76.6 +78.3 +78.0 +77.0 +76.3 +75.4 +79.4 +77.3 +77.8 +78.4 +76.0 +74.5 +78.3 +81.7 +79.1 +76.6 +70.9 +73.9 +75.5 +75.3 +73.5 +75.2 +76.4 +79.2 +81.4 +81.5 +76.2 +75.9 +75.0 +75.7 +76.1 +76.2 +74.8 +72.6 +72.1 +70.8 +70.0 +71.3 +70.7 +73.4 +77.4 +83.9 +77.2 +74.9 +73.8 +73.7 +73.8 +74.6 +81.2 +73.7 +72.0 +71.2 +71.5 +72.5 +75.5 +67.2 +63.3 +62.4 +62.6 +65.4 +73.3 +68.8 +71.9 diff --git a/data/samples/cms1.csv b/data/samples/cms1.csv new file mode 100644 index 0000000..f9dac55 --- /dev/null +++ b/data/samples/cms1.csv @@ -0,0 +1,1024 @@ +19.7 +19.7 +27.21 +27.21 +11.31 +20.34 +87.8 +48.06 +91.66 +134.56 +39.47 +39.47 +39.47 +39.47 +60.94 +60.94 +57.1 +56.19 +56.19 +56.081993958 +56.07 +143.85 +106.61 +157.18 +157.18 +156.83955307 +157.18 +59.15 +33.227942238 +189.23215517 +176.48 +210.64 +232.66916667 +6.55 +9.57 +32.93 +30.39 +30.39 +60.67 +55.971580817 +56.0 +55.806756757 +56.0 +80.67 +80.67 +53.47 +70.328338164 +30.76 +9.61 +17.29 +78.94 +78.94 +78.94 +78.94 +21.51 +21.51 +71.71 +71.71 +71.71 +71.71 +71.71 +112.55 +112.55 +112.55 +112.55 +112.55 +71.26 +76.201666667 +76.917894737 +222.12833333 +255.534 +181.64818182 +190.705625 +153.41583333 +150.10727273 +162.16727273 +180.48357143 +143.1765 +144.37578947 +139.57384615 +164.76363636 +157.15714286 +134.70611111 +134.17703704 +158.79 +168.96642857 +148.4825 +977.36666667 +1007.4645161 +748.75642857 +744.96571429 +721.73 +187.90333333 +119.30603774 +160.20323077 +133.00805369 +141.70496032 +114.33957447 +108.93799107 +111.9305298 +98.4925 +89.020243902 +101.18384615 +195.94588235 +95.798 +47.274166667 +208.16 +208.16 +27.59 +496.03076923 +127.3 +233.98 +885.73 +716.13 +135.59272727 +84.49 +13.35 +11.7 +104.8 +200.07 +70.09 +101.2 +716.13 +316.34 +47.981818182 +61.84 +81.26 +119.23 +19.62 +13.02 +252.13090909 +641.14428571 +309.614 +861.61 +696.67 +696.67 +343.13 +360.904375 +80.24 +1243.4972222 +54.95 +289.77 +31.33 +31.65 +196.16666667 +23.03 +31.89 +31.89 +31.7 +24.18 +24.090922509 +21.78 +28.251880878 +33.64 +31.267809524 +32.802631579 +33.64 +33.50982659 +27.367109091 +33.64 +69.98 +51.443333333 +23.35 +25.43 +105.1 +103.18222222 +48.5525 +29.529090909 +54.434705882 +54.498187919 +54.596428571 +83.581287129 +83.247037037 +83.226352941 +134.43 +116.59 +23.645333333 +22.991153846 +14.43 +28.95 +101.7 +101.7 +155.0 +68.42 
+64.651111111 +96.299893048 +126.56533333 +155.0 +152.67 +155.0 +17.21 +16.95 +42.98 +42.34 +42.171666667 +34.14 +151.92454545 +84.0095 +27.010540541 +80.086076923 +127.66045455 +32.5952 +92.678823529 +60.155882353 +59.0 +123.31125 +115.73493671 +124.09285714 +135.60464 +132.31758865 +113.9765625 +116.8467803 +119.17782609 +130.3475 +114.90185185 +134.01973451 +125.92954733 +143.05545455 +145.45666667 +76.154864865 +146.9525 +56.942307692 +85.5628 +76.306153846 +99.922608696 +94.08 +137.84846154 +127.57 +76.995555556 +134.1965 +107.79168378 +118.06507692 +131.85791209 +133.19310345 +76.046521739 +114.26391304 +139.1317757 +137.14821429 +100.51278689 +100.97554422 +112.03135993 +143.48409091 +137.04 +132.20744898 +105.0704 +122.64672727 +102.32065657 +210.73333333 +201.13363636 +172.77521739 +162.05533333 +156.72842105 +102.05 +150.3325 +153.24183673 +67.687368421 +242.87916667 +275.40066667 +255.82965517 +325.02958333 +224.36357143 +330.1325 +274.685 +261.045 +241.31882353 +228.96363636 +257.6775 +261.00823529 +212.70214286 +200.20454545 +221.79083333 +267.37166667 +235.64363636 +152.11875 +277.26076923 +289.41466667 +250.5325 +264.4125 +250.60642857 +231.825 +269.85071429 +257.28647059 +254.38454545 +254.06 +195.38 +247.14636364 +189.61541667 +473.88454545 +302.81090909 +319.4175 +333.26642857 +262.35909091 +287.5275 +301.80173913 +241.17 +261.24857143 +333.42636364 +331.66583333 +232.44833333 +286.97727273 +356.52727273 +263.60384615 +196.6525 +286.52461538 +313.21615385 +283.188125 +274.856 +305.83615385 +369.42133333 +369.82933333 +331.60647059 +356.72277778 +279.73764706 +298.369375 +308.66230769 +288.91611111 +296.16052632 +266.66107143 +257.4326087 +251.86 +243.134375 +362.20076923 +303.72095238 +268.98666667 +263.11916667 +295.31692308 +298.9045 +304.54642857 +301.81375 +333.04411765 +373.1152381 +317.91047619 +281.52233333 +260.14594595 +321.63823529 +183.40142857 +348.77653846 +311.8655 +303.25916667 +277.88923077 +291.28071429 +313.55 +318.1047619 +343.075 +402.24833333 +250.20294118 +227.165 +314.824375 +172.11769231 +255.855 +319.27142857 +348.46733333 +344.50052632 +337.73210526 +317.744 +341.0175 +272.56 +334.16909091 +293.86857143 +294.34285714 +294.50529412 +322.22375 +290.89142857 +224.29631579 +291.57142857 +290.92076923 +273.94722222 +337.03733333 +344.49142857 +465.78642857 +387.79071429 +367.85153846 +404.27692308 +365.34 +468.62578947 +527.41882353 +463.64833333 +613.66133333 +456.33333333 +599.25285714 +652.39285714 +529.87538462 +574.0325 +612.18777778 +566.37833333 +629.30916667 +516.43484848 +576.86 +585.0875 +545.66941176 +663.89444444 +574.82090909 +584.68647059 +445.815 +452.30833333 +471.74266667 +559.07769231 +195.80875 +198.85357143 +180.16454545 +219.4075 +187.21454545 +183.25357143 +159.98307692 +167.63533333 +128.72 +92.442 +176.891 +146.76636364 +89.95 +150.75465753 +181.65 +84.139166667 +74.4425 +75.538461538 +73.102631579 +76.011538462 +69.662857143 +173.07857143 +205.54666667 +64.950909091 +66.233076923 +74.508125 +161.85681818 +128.6375 +74.548181818 +60.648333333 +70.245882353 +162.36 +152.46875 +153.85789474 +68.405333333 +96.819090909 +88.153333333 +77.666923077 +81.609230769 +80.772666667 +92.6375 +79.12 +89.933157895 +198.26785714 +208.885 +230.995 +142.38181818 +115.65416667 +68.738823529 +66.518571429 +70.497096774 +75.947272727 +68.092142857 +69.630714286 +160.21307692 +109.16636364 +72.686363636 +158.53333333 +85.450909091 +74.224705882 +135.3 +63.475 +64.420769231 +66.145882353 +85.737142857 +81.930714286 +81.350909091 +71.889333333 
+84.475294118 +65.603636364 +64.391818182 +69.631428571 +65.126923077 +63.47884058 +70.096923077 +70.448181818 +67.586 +51.424166667 +56.962857143 +57.220714286 +54.612222222 +173.50454545 +52.424285714 +50.256388889 +54.403676471 +104.86538462 +223.03454545 +394.33214286 +387.62083333 +112.9075 +95.955555556 +94.430833333 +137.05291667 +102.84384615 +87.56369863 +115.64869565 +112.48263158 +126.83478261 +110.39466667 +101.44888889 +115.88324324 +113.42684211 +119.68860465 +114.25452055 +113.985 +118.76922078 +117.70833333 +124.506 +238.846 +193.725 +201.53076923 +214.09274194 +188.41363636 +103.11714286 +114.481875 +113.77818182 +109.28538462 +116.17 +112.48315789 +109.20538462 +202.02898551 +30.609756098 +59.0 +30.694444444 +79.0 +78.5 +45.0 +44.8 +59.0 +29.705882353 +31.341463415 +79.0 +45.0 +45.227272727 +9.8448275862 +26.5764 +25.43 +35.22 +67.39 +18.81 +98.27 +23.272439024 +49.08 +16.95 +241.83 +177.06 +39.41 +39.41 +23.15 +21.87 +21.87 +22.52 +35.032307692 +25.68 +111.9 +167.77 +160.95 +13.03 +191.9 +26.31 +27.56 +27.56 +26.31 +22.22 +25.42 +27.56 +26.31 +27.56 +28.74 +28.74 +25.417931034 +25.42 +25.42 +25.42 +26.535769231 +35.01 +37.49 +35.01 +35.01 +39.194249292 +39.19 +39.29 +39.19 +37.49 +35.01 +39.19 +39.063602484 +39.19 +39.29 +38.359583333 +34.38375 +35.01 +36.143127962 +35.0 +115.42263158 +109.74 +201.91 +205.82 +235.23571429 +242.31608696 +225.84714286 +287.55 +256.24470588 +228.58836735 +318.78153846 +283.86875 +31.13 +71.92 +105.42 +11.272941176 +5.3845714286 +123.13 +128.77 +111.9 +129.49 +167.77 +54.3715 +37.87 +48.29 +43.6 +78.93 +82.47 +71.92 +82.47 +20.51 +115.38 +113.44642857 +120.43 +102.3426087 +120.43 +121.33 +153.53 +151.23 +160.06 +160.06 +161.16 +152.68 +24.706962025 +183.44 +160.95 +196.13 +20.33 +19.799813665 +17.99 +54.9 +37.87 +43.82 +43.020909091 +43.626285714 +37.87 +48.29 +48.29 +78.93 +71.92 +115.38 +111.36268657 +119.67357143 +108.09 +160.06 +156.74026316 +144.1 +63.172340426 +106.29 +46.09 +53.450666667 +46.09 +46.039347826 +51.66 +126.82 +95.94 +95.94 +75.35 +82.47 +30.55 +173.82 +334.77 +395.39 +16.188461538 +110.0 +89.285714286 +79.081632653 +22.8 +45.62 +23.436666667 +25.024369748 +51.0 +121.04 +70.34 +58.42 +34.76 +39.92 +50.427676349 +109.68873684 +70.219272349 +66.76 +110.22 +172.84 +24.4 +24.08 +24.4 +24.4 +33.82 +38.0 +25.558333333 +106.44 +69.02 +101.28 +101.28 +41.01 +153.43 +49.5 +25.89 +25.685363409 +25.89 +35.97 +31.827777778 +44.23625 +72.699166667 +110.7 +110.02846154 +164.66848485 +404.106875 +80.248571429 +76.312105263 +44.43 +99.355 +93.657777778 +113.51686275 +265.9875 +276.06666667 +195.10652174 +130.95761905 +92.619285714 +181.03076923 +191.02272727 +95.182857143 +102.43 +102.79714286 +107.72181818 +117.61307692 +134.18545455 +142.01181818 +113.31272727 +135.55916667 +124.28416667 +137.61 +375.84923077 +203.55166667 +404.44823529 +151.60909091 +169.69777778 +157.56944444 +364.14866667 +169.29833333 +178.79214286 +177.02764706 +156.73636364 +341.325 +356.28083333 +337.70333333 +326.89615385 +182.67285714 +323.52727273 +332.00681818 +178.07933333 +336.68235294 +317.75 +325.67666667 +183.08461538 +142.94454545 +163.866 +162.56866667 +149.31083333 +183.82083333 +390.33588235 +185.18666667 +155.05133333 +172.07541667 +162.79090909 +153.37 +157.49117647 +160.37333333 +183.99071429 +135.74285714 +139.48307692 +148.81142857 +156.59230769 +153.33363636 +145.98166667 +153.24071429 +148.33375 +161.07357143 +157.88454545 +151.10583333 +149.21357143 +344.69285714 +337.353125 +151.46692308 +153.45037037 +153.8873913 +148.99357143 +180.1475 
+168.69 +167.29785714 +165.60421053 +187.625 +184.34615385 +190.78933333 +196.995 +121.82703704 +401.35588235 +294.22380952 +355.52857143 +67.186363636 +82.175 +128.91615385 +231.05705882 +91.531173184 +90.953633333 +180.605 +192.99285714 +132.22470588 +242.2128 +123.0 +125.434375 +125.05 +123.68333333 +125.64791667 +126.034 +123.95666667 +124.05428571 +121.86111111 +124.15 +56.305357143 +61.503636364 +125.21803279 +148.56642857 +123.42205882 +58.92 +122.46521739 +125.1140625 +127.06339286 +124.91333333 +120.97985437 +126.16186441 +161.72083333 +124.48625 +127.1 +62.3745 +59.531923077 +127.06693548 +62.072777778 +103.955 +61.268571429 +63.16875 +126.53448276 +62.7 +66.775 +62.0790625 +60.221388889 +107.40785714 +208.37095238 +142.71538462 +129.07538462 +111.1132 +103.84875 +66.628181818 +126.54181818 +57.292972973 +66.043333333 +61.038181818 +90.576363636 +64.095882353 +61.162 +65.881818182 +82.889583333 +57.402857143 +54.670666667 +55.60875 +60.245909091 +53.815666667 +60.094375 +58.604285714 +53.930555556 +54.467727273 +54.035714286 +55.424137931 +138.4925 +68.205384615 +53.386 +52.052926829 +50.411428571 +51.708333333 +54.67030303 +54.298571429 +62.300555556 +66.764 +69.589444444 +119.84907407 +133.64540541 +60.625 +63.7172 +60.858888889 +130.08181818 +60.888666667 +61.571333333 +71.0875 +64.851333333 +63.953478261 +64.0864 +66.982413793 +63.175789474 +71.818125 +49.323529412 +73.357826087 +66.2588 +67.16 +64.213870968 +69.029428571 +61.152368421 +55.125555556 +131.5075 +63.251764706 +65.416818182 +110.03375 +61.63125 +56.975833333 +56.451428571 +69.152307692 +57.214285714 +62.358333333 +130.175 +189.81375 +202.61636364 +136.44518519 +132.38517241 +206.31083333 +204.44166667 +178.29642857 +158.72692308 +194.52461538 +198.29230769 +187.77230769 +180.55230769 +232.86090909 +125.52272727 +219.87181818 +180.72181818 +223.30111111 +224.33571429 +219.19418605 +222.29217391 +188.31 +184.47615385 +190.59769231 +194.25705882 +165.54 +173.37948052 +179.35375 +169.54 +168.35294118 +168.0156 +165.18533333 +192.14714286 +267.54333333 +243.28733333 +246.28133333 +163.96222222 +139.05867925 +168.82071429 +134.76666667 +172.29555556 +163.79 +166.58333333 +152.49285714 +162.56455224 +153.36714286 +149.13928571 +137.9875 +149.27045455 +138.94576923 +167.91818182 +172.90142857 +154.33269231 +153.79 +136.86419048 +122.32595745 +148.18384615 +152.11 +157.9744 +142.76002857 +152.10789474 +137.38034483 +150.29624113 +154.88944444 +144.72590909 +124.65931818 +100.87705882 +128.42615385 +128.58416667 +129.87210526 +89.9565 +73.5375 +79.203913043 +82.802619048 +137.80928571 +149.9148 +146.95310345 +123.96928571 +156.88666667 +122.13321311 +128.8078022 +79.849230769 +127.29525862 +129.42352941 +144.61952381 +131.72307692 +127.19090909 +143.42818182 +130.62384615 +123.12271523 +133.41789474 +133.28608696 +129.0721875 +144.16909091 +158.64272727 +150.46777778 diff --git a/data/samples/cms25.csv b/data/samples/cms25.csv new file mode 100644 index 0000000..38170fd --- /dev/null +++ b/data/samples/cms25.csv @@ -0,0 +1,1024 @@ +3.749236589 +0.0919935319 +3.8361690819 +4.8864346741 +2.6433821139 +2.4418322811 +0.7275816105 +15.520606614 +17.98974279 +28.780029241 +3.9265424915 +14.899094412 +5.0582415374 +1.0602999454 +11.269735199 +6.2770032991 +0.413622001 +0.3411312217 +0.4378538299 +4.9731822591 +0.4462290983 +23.40319762 +17.608103268 +19.299201633 +21.969316987 +16.973725937 +2.8586600582 +9.0866271498 +3.6134893915 +10.375066724 +14.363405295 +20.418102387 +20.873824716 +1.0584786116 +2.1728984337 +0.2572548153 
+4.1008673343 +2.0923132741 +0.4848405392 +7.662816214 +3.4213121148 +5.6383155518 +0.2852998423 +9.7841525523 +0.5268350221 +13.036269214 +16.483995416 +0.2232660988 +0.0 +1.5644763615 +15.001063979 +0.0 +17.72506055 +17.447279649 +2.889485492 +1.6069903211 +0.6192552592 +8.2349045566 +9.4262935585 +8.9806520339 +12.813698039 +28.299904131 +5.1386414833 +0.8601923017 +1.3852760555 +5.5660854124 +5.1663229001 +18.501185473 +15.956722495 +38.786935972 +57.473326041 +24.621215571 +53.256375935 +28.891696171 +12.828853391 +14.960716936 +36.817926313 +13.946746673 +12.225831832 +7.2605057936 +10.738286488 +33.930606435 +9.2219660607 +8.0206387241 +20.723862639 +23.410478303 +17.254153123 +60.31483651 +148.66711969 +57.355971915 +51.807896097 +56.247541621 +22.557424881 +34.575818447 +9.4200428019 +20.665385497 +28.452608641 +8.569572022 +9.9201135126 +9.4756749493 +29.540479701 +13.666382577 +19.323709476 +12.897639788 +33.387083264 +13.000563129 +2.0279917695 +1.5503404644 +0.1539679267 +76.313378084 +1.0157763794 +2.3303105174 +3.7758707605 +45.900021911 +30.047048672 +1.9026919578 +3.0477147651 +0.089566859 +0.679299456 +1.353448271 +0.4656284748 +0.6673971831 +21.833023627 +22.947173718 +48.992756253 +14.554295553 +15.242811744 +0.8370157323 +0.0856794738 +0.0580409338 +51.794596148 +81.203491644 +74.095901323 +16.587550295 +4.0229335171 +93.585876411 +2.4277693465 +37.194223489 +0.0 +289.91124951 +0.4189638011 +42.149241841 +0.0 +0.0 +6.0009103219 +4.2187757367 +6.5746935998 +8.0350627614 +0.220836478 +0.181895123 +7.5604411198 +4.0577317505 +8.9017667004 +8.1228638501 +8.3445793924 +9.6556765359 +7.1655482296 +1.4640752857 +5.4658639447 +10.030731437 +9.7184898327 +10.462701578 +0.1980227708 +0.1272163512 +0.9899494937 +4.8707711003 +4.0792797695 +0.3237359397 +16.41172224 +17.530941214 +21.065354427 +24.43136814 +12.392468619 +23.47304013 +39.50140737 +27.098412415 +1.8702876784 +3.7869937024 +0.0948683298 +9.3143980055 +0.6688312224 +20.02942235 +29.648412328 +17.676027382 +11.778311309 +20.497217184 +11.045041145 +37.9731762 +28.94675785 +38.579949987 +3.2951976776 +4.3362859569 +0.3830111234 +0.3487770178 +0.6138924082 +0.1984406668 +33.246620284 +0.3662686855 +5.2181676087 +23.538942004 +32.95452306 +8.3915134845 +16.382078169 +2.4899612982 +13.84849221 +22.274039881 +14.440167219 +44.203376689 +7.5772519912 +16.883297116 +13.246905413 +42.692225884 +13.644376422 +12.102707034 +12.691668287 +12.662771999 +17.661231634 +14.223072456 +9.731148933 +11.544009043 +14.737254639 +2.054752033 +26.468556331 +8.1354868055 +39.156892186 +28.644065894 +13.951819079 +81.800428687 +8.8599719226 +35.885954617 +7.246165024 +13.067754339 +6.9100834075 +12.886034781 +9.0921821671 +15.71291955 +14.864107578 +16.143032307 +7.1399289577 +9.7349937044 +23.349102569 +10.446429112 +20.138665876 +18.729421237 +6.9716429168 +28.078467222 +16.234679248 +26.742423992 +27.621096291 +29.286281671 +12.404754178 +19.354855203 +0.8066408047 +10.57059029 +20.084542742 +3.4754334511 +37.829986563 +59.209405882 +34.571649603 +79.512137997 +11.946174181 +55.907709789 +37.023361619 +54.130197173 +25.325416149 +21.562202965 +27.880645953 +56.328483229 +35.609715167 +46.640149908 +33.951833529 +62.256577185 +30.371297101 +19.808126975 +64.008405458 +39.137375351 +55.719754853 +44.613190905 +43.412463889 +40.927279437 +36.404114507 +45.170256336 +53.724632976 +43.008245035 +19.6893305 +34.491809855 +25.009054471 +69.975850037 +76.844903238 +67.046507055 +57.490291004 +57.429981947 +74.65019357 +61.570932918 +57.169476121 
+55.707632826 +122.30330175 +90.778808008 +31.869583069 +57.107873261 +62.120952229 +55.044517949 +55.233776175 +54.979476935 +57.654772949 +51.318856184 +61.042312451 +37.138964817 +95.213158325 +102.89763993 +79.888204897 +62.57633889 +54.700427653 +73.321772152 +26.789592565 +52.631770354 +77.16444602 +59.783190307 +35.401620679 +48.299504318 +37.829645927 +67.610716991 +53.280166203 +35.687222151 +28.339763976 +50.587060616 +69.89005166 +49.795044243 +50.452489641 +80.222792855 +113.45725528 +82.420574444 +45.657795781 +48.49277006 +44.140424653 +48.439697626 +74.344662062 +57.737796587 +58.901371989 +53.850857784 +29.333529198 +59.327067126 +47.529121857 +65.064116104 +89.219228694 +26.72474334 +74.015238014 +70.902080706 +30.274046885 +40.275325256 +56.671180182 +67.641734816 +60.690158178 +88.084960196 +61.080917578 +58.102102724 +69.388377189 +55.14977031 +74.755984823 +70.353451785 +40.456199109 +85.038863154 +49.734828583 +11.426603017 +53.608672505 +78.40611452 +51.525376704 +70.370244911 +54.085832263 +50.880701044 +42.180104006 +22.477997974 +49.277793347 +14.892123454 +61.354368643 +100.57310405 +49.357074883 +130.08561273 +71.256252513 +134.1572284 +124.71876538 +74.140995655 +85.457419724 +91.334273429 +66.749320087 +107.66778828 +62.608472307 +66.471939373 +114.02613372 +101.01780642 +188.40707854 +121.87029154 +120.59397342 +25.604452658 +58.709377774 +73.581592052 +90.079306051 +36.81364769 +44.508105069 +23.462872729 +27.338105617 +51.338323175 +38.880621957 +31.758861914 +29.466730845 +29.264258853 +14.621965881 +39.529184637 +33.290471697 +10.171582521 +31.506884245 +33.70619584 +29.606641839 +7.657741972 +24.649925717 +13.891530192 +17.039005548 +17.615955372 +30.020391752 +56.142491478 +7.6600394881 +14.139850187 +12.190159389 +32.128666431 +21.081828471 +23.921717164 +9.74527275 +19.29809378 +22.451680759 +24.073284376 +50.316548023 +25.674181445 +35.874126223 +14.444651524 +11.021340032 +15.401513643 +12.61745345 +17.404234758 +9.3796968235 +27.36948923 +23.212829636 +17.329678992 +30.252110873 +40.454545822 +28.15320836 +11.395267821 +15.313966694 +16.527044654 +13.528376116 +9.7616836328 +8.8283678028 +30.097215346 +40.619740695 +7.6100798749 +30.561007689 +16.876334695 +12.066202448 +46.919331945 +12.326242463 +10.007960205 +16.47974023 +12.322296781 +8.7007568284 +15.431167291 +16.452079369 +13.449049438 +6.2104083522 +17.626785403 +14.173645654 +0.6821251165 +6.7444500868 +9.6748540041 +8.5875600976 +14.9667551 +3.2618999022 +5.5083013419 +7.4989613226 +13.317461804 +40.268984702 +8.4313154432 +11.76332867 +13.072324883 +6.450771524 +42.95557713 +27.703306561 +21.93618054 +19.540022486 +13.308318201 +13.746318743 +60.333232182 +22.143434795 +7.3884130248 +42.539655512 +36.287100682 +50.978528491 +40.562542305 +20.837109273 +31.625147741 +39.128239634 +36.110169742 +37.364393639 +32.897723899 +32.113002789 +8.9882032255 +17.145208647 +37.597823594 +18.056041675 +15.315163001 +23.870061232 +37.859646063 +10.684867041 +11.950313215 +17.071900938 +7.8163584342 +13.978188488 +13.315918972 +7.7114301163 +16.692979743 +8.3016595604 +17.053378122 +6.3993838438 +17.217882119 +19.371859946 +13.865251341 +14.989154584 +19.963909296 +9.925195648 +10.720137444 +19.518265606 +8.3373999582 +13.63009343 +2.9395818577 +4.9483578728 +0.2002651304 +0.3489629607 +10.761579988 +3.2946236128 +19.637737281 +5.3308816367 +0.3938296271 +3.6917117896 +1.8914754602 +1.4004867958 +11.803380119 +9.0529657958 +0.1791254798 +6.0440052945 +3.9909895894 +2.955812261 +8.1461548631 
+0.1746016108 +0.7890699751 +1.6026178584 +36.932650009 +0.0757682546 +1.1840824463 +7.1699627451 +8.5826444332 +7.8438699972 +3.518501458 +5.6705719171 +5.6230506556 +6.7752646591 +5.7821781952 +6.735809602 +3.0589257116 +7.1172853373 +6.8132505735 +8.0059665875 +4.8393994575 +0.0 +6.7931486808 +6.1115941151 +9.5929930554 +6.8061622072 +8.8729243283 +6.9572293613 +1.8575534462 +8.6138213479 +8.614058277 +5.6819050459 +7.8307094509 +7.1514857159 +4.904586168 +8.0236937914 +4.8973497917 +7.5191182677 +6.1166227597 +9.1660497188 +8.2951190844 +7.3731085628 +28.874051046 +32.311457414 +33.319496521 +27.143212369 +35.426658935 +29.612463094 +33.633332569 +27.608192957 +35.502570568 +45.112847604 +39.70919064 +37.075784136 +6.0575586353 +21.377922735 +31.529021996 +3.0120129723 +0.0474698615 +0.0 +0.0 +0.9872570806 +1.2321742581 +1.6767155394 +7.6909353514 +0.3582674358 +13.236534476 +9.6641400381 +22.513651423 +30.416791306 +11.127702962 +18.466657022 +3.387644082 +28.710030711 +1.236990825 +28.747722983 +18.961713605 +27.237623822 +6.6652938531 +45.512408053 +29.300211672 +31.615180619 +57.212900436 +51.222012718 +3.1715295994 +4.7593148332 +45.03344214 +40.754912251 +38.181806224 +4.2680839067 +2.190742053 +3.8742642156 +0.4898979486 +0.3163426636 +0.3941548233 +0.8440251847 +1.0504109983 +0.3124275632 +15.382204352 +15.743166838 +17.957470743 +16.104954066 +23.661734521 +18.332309712 +33.140301041 +18.249876526 +31.03832592 +32.100719057 +28.141938941 +19.104845308 +36.167699323 +0.3728380948 +1.0391770248 +0.3562402589 +0.2480042266 +0.3452927094 +0.950287335 +21.246039052 +16.172450636 +0.7602498676 +22.990982051 +0.1867978201 +26.193386721 +1.3804110597 +18.795523902 +3.0744955809 +14.287767852 +20.699845388 +15.558239133 +6.3457535746 +14.41733445 +3.7381487574 +4.0761509177 +9.2693849104 +5.7070396953 +13.923514069 +15.885233555 +0.1959591794 +9.1729073129 +3.9656376573 +13.347489025 +3.6459800112 +6.7206115629 +0.4538378014 +33.144171062 +8.4789509639 +3.2193056563 +0.117 +5.9507235693 +0.0 +8.9228149928 +0.6577972501 +0.8683636345 +20.965282663 +35.124925623 +22.082206281 +14.553616284 +47.743412573 +11.182084919 +0.0 +6.7114589899 +5.3231707944 +4.9603091064 +3.6811308984 +3.8045546306 +20.586100354 +5.7006353339 +2.3841533361 +30.380539606 +91.831358902 +22.052504613 +21.142338036 +5.9992393036 +12.038479059 +11.000800956 +15.55162702 +45.21690345 +26.756991059 +20.888936788 +38.325394724 +3.7875424831 +20.717301634 +19.709061137 +9.3277504561 +6.7984962698 +7.3516311185 +18.528229568 +35.473891257 +22.202145455 +18.877411225 +7.9786980856 +11.381369615 +13.146953597 +19.573220421 +33.513164701 +16.924997354 +111.76181058 +17.127717472 +14.122279974 +12.394880427 +145.26604746 +10.439128981 +11.752951887 +20.563527325 +8.990782424 +14.911306928 +30.234612018 +26.660634518 +50.091216275 +10.976909833 +20.44073643 +19.660269986 +17.585319509 +51.650540027 +35.822934595 +35.382870281 +16.532445274 +7.6321115868 +15.527746549 +11.793973866 +3.7429819515 +14.267810702 +42.510898102 +41.738913854 +8.852184062 +18.799451856 +18.354843034 +7.9895579313 +16.732499529 +13.593695306 +39.170220734 +5.7760941626 +8.3881740507 +11.64732129 +16.99431043 +18.57315903 +12.52272257 +12.352871041 +20.941212208 +24.221773189 +17.356204823 +8.0493627681 +8.1134541827 +25.526388717 +37.006061307 +24.601672359 +20.231295061 +15.699014449 +7.9999568876 +6.3243357139 +35.083645305 +16.087246137 +10.281589711 +10.696217761 +18.648480465 +24.523420606 +22.623182929 +8.7255116389 +28.834982442 +40.034855704 
+36.78329757 +6.2574639068 +18.23254005 +13.350707004 +72.919283346 +21.384252086 +19.701642899 +6.1245067352 +33.261119339 +31.537471501 +92.784110964 +4.5972794686 +7.4751830036 +4.7108801044 +5.9185563524 +6.6679850649 +6.7814051641 +6.1358812099 +6.0203914308 +4.6241283713 +6.0361697516 +1.140028363 +2.1072859145 +10.804516492 +16.482235536 +9.3177751959 +2.6246120018 +4.7671918712 +5.7161126091 +16.199839175 +8.3644755697 +7.7957899182 +6.3671962381 +9.3813116531 +2.6434293541 +2.0056434268 +2.7796683255 +2.0901017783 +3.064705529 +2.6721027924 +8.1969955116 +2.9099270487 +4.3291526107 +11.840682465 +14.504941878 +21.175076139 +11.314664913 +2.112790414 +20.187457272 +30.038759799 +27.030253311 +25.968948959 +24.058155986 +21.513494738 +9.9406721924 +24.532078544 +7.9732846481 +11.058957359 +7.3501569107 +19.550665817 +8.7358926711 +13.850406107 +8.5690218542 +21.593660769 +4.1695840467 +5.5387757974 +5.5070203774 +11.432379825 +10.450614846 +8.3272042643 +10.847206453 +11.945545337 +8.6297876878 +14.465541525 +12.585530251 +25.757041796 +21.020281978 +10.822108852 +13.799806463 +10.00502748 +9.6090645862 +6.6580891928 +6.9620964706 +4.3401449669 +9.4275121262 +6.5923577387 +19.298742858 +41.228964336 +13.507583433 +12.954884015 +11.894347345 +6.0453775798 +7.9506501761 +5.7525524383 +11.71280149 +4.5803959072 +7.8580059713 +11.325990754 +8.7148639954 +7.1544843269 +9.2860227417 +2.480356041 +19.052031095 +9.2942367002 +9.0886214908 +9.9135612349 +18.13819127 +8.026572869 +17.479715625 +30.623247374 +9.0162537967 +10.248065691 +26.31171051 +15.664958323 +9.6665353439 +8.6146566196 +8.0829293419 +11.690286392 +15.848519625 +19.344740169 +45.359439821 +51.625004528 +30.780963012 +26.044660039 +46.13093746 +66.556822523 +68.252232975 +15.367570673 +43.079650985 +30.716170423 +15.710722943 +14.650236301 +61.734416403 +34.558279485 +47.545742638 +13.643558466 +55.858977655 +34.173298014 +46.648367233 +40.333621614 +36.328997438 +26.971935689 +27.379285393 +38.502652486 +25.757011181 +25.830323391 +31.477072584 +16.691215683 +30.13196316 +39.780171817 +18.818055113 +27.280685111 +92.38802586 +42.481803582 +46.07609998 +12.621112053 +10.662011746 +18.054073032 +13.578064111 +17.390544443 +10.88176635 +23.385689676 +7.5431228646 +28.982375752 +11.083882604 +42.089020219 +28.795408362 +31.309694549 +14.859978836 +33.377991487 +31.287632239 +26.340083074 +13.573163207 +11.961105385 +17.058841658 +23.275424986 +21.889013966 +28.780875194 +15.676268216 +23.094103107 +19.8851884 +21.974174227 +32.336236682 +21.208437897 +9.9833587124 +28.884912861 +4.999883904 +19.638612954 +11.407940931 +39.077997041 +7.5752398087 +18.580825625 +27.608793069 +16.980939012 +24.702688144 +26.616511174 +7.8387160548 +10.730192211 +10.377837367 +10.950521111 +15.220187217 +27.417234758 +9.3409516609 +19.940440239 +10.885074576 +14.984332305 +14.5610803 +12.660891562 +13.733301823 +15.819069885 +13.84215999 +20.852342543 +35.340483938 +16.733791763 +19.033650675 diff --git a/data/samples/cms9.csv b/data/samples/cms9.csv new file mode 100644 index 0000000..fb0bd04 --- /dev/null +++ b/data/samples/cms9.csv @@ -0,0 +1,1024 @@ +16.0 +22.0 +93.0 +198.0 +20.0 +15.0 +32.0 +87.0 +43.0 +19.0 +80.0 +14.0 +37.0 +274.0 +682.0 +106.0 +24.0 +46.0 +13.0 +331.0 +42.0 +65.0 +87.0 +70.0 +27.0 +179.0 +215.0 +25.0 +554.0 +116.0 +92.0 +28.0 +36.0 +637.0 +33.0 +50.0 +239.0 +432.0 +39.0 +563.0 +124.0 +185.0 +150.0 +41.0 +208.0 +597.0 +1035.0 +17.0 +70.0 +296.0 +101.0 +77.0 +107.0 +313.0 +309.0 +469.0 +95.0 +515.0 +606.0 +599.0 +396.0 +28.0 
+153.0 +34.0 +40.0 +22.0 +39.0 +12.0 +19.0 +12.0 +15.0 +11.0 +16.0 +12.0 +11.0 +11.0 +14.0 +20.0 +19.0 +13.0 +11.0 +14.0 +18.0 +27.0 +15.0 +14.0 +12.0 +30.0 +31.0 +14.0 +14.0 +13.0 +78.0 +53.0 +65.0 +149.0 +252.0 +141.0 +224.0 +151.0 +12.0 +41.0 +13.0 +17.0 +15.0 +12.0 +18.0 +12.0 +14.0 +13.0 +11.0 +11.0 +26.0 +48.0 +11.0 +13.0 +11.0 +15.0 +34.0 +30.0 +27.0 +60.0 +26.0 +51.0 +11.0 +17.0 +18.0 +27.0 +12.0 +12.0 +11.0 +14.0 +15.0 +44.0 +13.0 +68.0 +15.0 +16.0 +13.0 +18.0 +25.0 +19.0 +11.0 +11.0 +18.0 +386.0 +213.0 +106.0 +212.0 +197.0 +271.0 +70.0 +638.0 +239.0 +420.0 +152.0 +729.0 +173.0 +550.0 +233.0 +22.0 +12.0 +13.0 +15.0 +12.0 +54.0 +16.0 +11.0 +68.0 +149.0 +14.0 +101.0 +27.0 +255.0 +31.0 +52.0 +15.0 +26.0 +23.0 +14.0 +14.0 +14.0 +14.0 +250.0 +18.0 +374.0 +30.0 +11.0 +49.0 +12.0 +16.0 +76.0 +11.0 +14.0 +12.0 +13.0 +11.0 +20.0 +37.0 +130.0 +22.0 +25.0 +17.0 +17.0 +39.0 +24.0 +237.0 +35.0 +125.0 +141.0 +32.0 +264.0 +23.0 +44.0 +27.0 +113.0 +486.0 +11.0 +15.0 +37.0 +12.0 +13.0 +25.0 +13.0 +23.0 +16.0 +13.0 +18.0 +18.0 +20.0 +487.0 +65.0 +91.0 +58.0 +23.0 +322.0 +107.0 +112.0 +122.0 +294.0 +1103.0 +22.0 +14.0 +98.0 +50.0 +55.0 +396.0 +12.0 +11.0 +23.0 +15.0 +19.0 +28.0 +64.0 +49.0 +19.0 +12.0 +15.0 +29.0 +24.0 +14.0 +12.0 +12.0 +20.0 +17.0 +11.0 +12.0 +17.0 +14.0 +11.0 +12.0 +18.0 +11.0 +24.0 +13.0 +15.0 +16.0 +20.0 +14.0 +12.0 +14.0 +17.0 +11.0 +23.0 +11.0 +11.0 +24.0 +11.0 +11.0 +12.0 +14.0 +11.0 +20.0 +23.0 +12.0 +14.0 +11.0 +12.0 +18.0 +11.0 +11.0 +13.0 +12.0 +13.0 +13.0 +16.0 +15.0 +13.0 +15.0 +15.0 +17.0 +18.0 +17.0 +16.0 +13.0 +18.0 +19.0 +28.0 +23.0 +15.0 +16.0 +13.0 +21.0 +24.0 +12.0 +13.0 +20.0 +14.0 +16.0 +17.0 +21.0 +21.0 +30.0 +37.0 +17.0 +14.0 +26.0 +20.0 +12.0 +13.0 +14.0 +16.0 +21.0 +22.0 +12.0 +17.0 +18.0 +16.0 +13.0 +14.0 +14.0 +15.0 +19.0 +19.0 +25.0 +12.0 +12.0 +11.0 +56.0 +14.0 +17.0 +16.0 +21.0 +19.0 +21.0 +13.0 +18.0 +15.0 +14.0 +14.0 +14.0 +13.0 +13.0 +12.0 +38.0 +17.0 +12.0 +15.0 +12.0 +28.0 +14.0 +13.0 +16.0 +18.0 +18.0 +24.0 +33.0 +11.0 +24.0 +17.0 +18.0 +11.0 +17.0 +14.0 +18.0 +15.0 +13.0 +16.0 +14.0 +11.0 +12.0 +11.0 +14.0 +13.0 +15.0 +15.0 +15.0 +60.0 +11.0 +12.0 +73.0 +11.0 +12.0 +16.0 +13.0 +19.0 +13.0 +14.0 +14.0 +15.0 +11.0 +13.0 +16.0 +22.0 +12.0 +11.0 +12.0 +17.0 +15.0 +24.0 +19.0 +15.0 +11.0 +12.0 +13.0 +13.0 +15.0 +16.0 +16.0 +19.0 +14.0 +14.0 +14.0 +11.0 +12.0 +17.0 +28.0 +31.0 +11.0 +14.0 +14.0 +13.0 +22.0 +11.0 +12.0 +11.0 +17.0 +14.0 +26.0 +13.0 +17.0 +14.0 +14.0 +11.0 +15.0 +17.0 +11.0 +11.0 +14.0 +13.0 +69.0 +13.0 +11.0 +15.0 +12.0 +14.0 +28.0 +18.0 +11.0 +14.0 +144.0 +68.0 +13.0 +22.0 +14.0 +12.0 +20.0 +18.0 +24.0 +24.0 +39.0 +73.0 +46.0 +19.0 +23.0 +30.0 +36.0 +37.0 +38.0 +43.0 +73.0 +74.0 +77.0 +12.0 +15.0 +20.0 +12.0 +13.0 +62.0 +11.0 +14.0 +16.0 +11.0 +13.0 +18.0 +19.0 +26.0 +69.0 +41.0 +87.0 +36.0 +28.0 +40.0 +20.0 +75.0 +11.0 +17.0 +41.0 +36.0 +17.0 +44.0 +58.0 +50.0 +21.0 +13.0 +119.0 +80.0 +69.0 +41.0 +13.0 +48.0 +166.0 +14.0 +23.0 +373.0 +16.0 +14.0 +382.0 +70.0 +13.0 +59.0 +62.0 +20.0 +26.0 +13.0 +11.0 +283.0 +51.0 +214.0 +290.0 +307.0 +187.0 +46.0 +172.0 +28.0 +108.0 +241.0 +203.0 +30.0 +16.0 +87.0 +130.0 +235.0 +298.0 +189.0 +148.0 +353.0 +142.0 +441.0 +124.0 +90.0 +590.0 +247.0 +161.0 +119.0 +300.0 +288.0 +80.0 +734.0 +422.0 +207.0 +19.0 +54.0 +16.0 +51.0 +21.0 +23.0 +14.0 +17.0 +17.0 +49.0 +13.0 +24.0 +19.0 +47.0 +34.0 +17.0 +35.0 +30.0 +12.0 +72.0 +26.0 +32.0 +20.0 +12.0 +29.0 +136.0 +237.0 +16.0 +211.0 +23.0 +42.0 +52.0 +28.0 +55.0 +46.0 +63.0 +11.0 +77.0 +128.0 +27.0 +13.0 +69.0 +18.0 +79.0 +16.0 +42.0 +14.0 +32.0 +161.0 +73.0 
+11.0 +148.0 +18.0 +44.0 +35.0 +65.0 +14.0 +23.0 +76.0 +11.0 +31.0 +67.0 +14.0 +21.0 +43.0 +114.0 +88.0 +47.0 +12.0 +140.0 +45.0 +49.0 +46.0 +31.0 +37.0 +22.0 +50.0 +36.0 +30.0 +17.0 +21.0 +14.0 +36.0 +65.0 +14.0 +21.0 +98.0 +11.0 +24.0 +99.0 +119.0 +53.0 +12.0 +63.0 +58.0 +21.0 +14.0 +482.0 +190.0 +481.0 +197.0 +44.0 +22.0 +99.0 +203.0 +30.0 +168.0 +39.0 +262.0 +12.0 +19.0 +86.0 +14.0 +49.0 +31.0 +36.0 +11.0 +47.0 +399.0 +47.0 +116.0 +18.0 +16.0 +72.0 +18.0 +26.0 +33.0 +16.0 +14.0 +19.0 +12.0 +14.0 +27.0 +51.0 +12.0 +21.0 +23.0 +21.0 +14.0 +13.0 +11.0 +14.0 +14.0 +14.0 +11.0 +13.0 +11.0 +11.0 +11.0 +12.0 +12.0 +12.0 +13.0 +12.0 +17.0 +11.0 +18.0 +18.0 +15.0 +12.0 +14.0 +17.0 +11.0 +12.0 +12.0 +15.0 +13.0 +14.0 +11.0 +22.0 +15.0 +17.0 +11.0 +15.0 +13.0 +22.0 +15.0 +15.0 +12.0 +12.0 +17.0 +21.0 +15.0 +24.0 +11.0 +16.0 +17.0 +24.0 +14.0 +14.0 +13.0 +28.0 +13.0 +22.0 +12.0 +14.0 +16.0 +14.0 +66.0 +12.0 +14.0 +14.0 +16.0 +13.0 +27.0 +46.0 +14.0 +12.0 +14.0 +14.0 +19.0 +22.0 +13.0 +15.0 +16.0 +27.0 +17.0 +42.0 +14.0 +11.0 +12.0 +13.0 +17.0 +537.0 +300.0 +20.0 +14.0 +17.0 +25.0 +11.0 +16.0 +18.0 +21.0 +24.0 +25.0 +30.0 +35.0 +36.0 +41.0 +28.0 +22.0 +61.0 +14.0 +34.0 +25.0 +23.0 +32.0 +168.0 +15.0 +206.0 +59.0 +12.0 +40.0 +12.0 +20.0 +26.0 +62.0 +18.0 +12.0 +35.0 +24.0 +29.0 +12.0 +14.0 +32.0 +36.0 +14.0 +21.0 +13.0 +13.0 +25.0 +16.0 +11.0 +11.0 +37.0 +21.0 +11.0 +11.0 +17.0 +15.0 +11.0 +24.0 +14.0 +15.0 +16.0 +22.0 +30.0 +16.0 +35.0 +18.0 +22.0 +21.0 +29.0 +12.0 +13.0 +25.0 +41.0 +28.0 +36.0 +33.0 +35.0 +18.0 +15.0 +18.0 +54.0 +37.0 +14.0 +25.0 +27.0 +11.0 +15.0 +15.0 +20.0 +15.0 +23.0 +25.0 +29.0 +19.0 +16.0 +17.0 +23.0 +25.0 +27.0 +31.0 +35.0 +38.0 +18.0 +40.0 +17.0 +22.0 +40.0 +24.0 +24.0 +28.0 +13.0 +21.0 +12.0 +12.0 +16.0 +11.0 +27.0 +29.0 +12.0 +12.0 +14.0 +13.0 +13.0 +13.0 +13.0 +13.0 +11.0 +11.0 +11.0 +11.0 +27.0 +35.0 +43.0 +23.0 +19.0 +13.0 +13.0 +17.0 +28.0 +77.0 +16.0 +60.0 +68.0 +25.0 +15.0 +14.0 +12.0 +15.0 +15.0 +18.0 +53.0 +14.0 +12.0 +36.0 +12.0 +21.0 +14.0 +134.0 +14.0 +14.0 +20.0 +22.0 +26.0 +11.0 +14.0 +26.0 +40.0 +105.0 +141.0 +13.0 +19.0 +25.0 +350.0 +19.0 +29.0 +141.0 +18.0 +22.0 +220.0 +17.0 +13.0 +12.0 +114.0 +20.0 +24.0 +23.0 +42.0 +98.0 +75.0 +29.0 +14.0 +15.0 +305.0 +91.0 +52.0 +348.0 +51.0 +21.0 +13.0 +44.0 +11.0 +13.0 +302.0 +38.0 +23.0 +96.0 +11.0 +11.0 +18.0 diff --git a/data/samples/food_prices.csv b/data/samples/food_prices.csv new file mode 100644 index 0000000..c9fd56f --- /dev/null +++ b/data/samples/food_prices.csv @@ -0,0 +1,1024 @@ +15.375 +14.0333 +14.25 +14.875 +13.975 +14.15 +14.35 +13.725 +13.45 +13.575 +13.725 +13.225 +14.7 +16.275 +14.95 +15.6 +16.375 +23.45 +34.7 +39.175 +33.175 +31.625 +19.0 +31.5175 +24.8925 +24.0325 +23.925 +21.2 +18.5925 +18.7925 +16.9325 +18.535 +16.42 +16.875 +16.6775 +16.5 +16.575 +15.7 +15.9 +13.85 +13.15 +12.465 +12.3125 +12.2 +13.2125 +13.2125 +13.875 +14.25 +15.25 +15.75 +15.5 +17.5 +17.375 +22.05 +22.14 +21.75 +21.5 +21.0 +18.8125 +19.2125 +18.875 +18.0 +18.0 +18.3333 +19.75 +18.875 +19.6667 +21.25 +20.0 +19.625 +19.5 +20.0 +19.25 +18.75 +20.0 +20.0 +20.0 +20.25 +21.0 +21.0 +21.0 +21.0 +20.0 +20.0 +23.0 +23.0 +22.5 +22.0 +22.0 +22.0 +22.0 +23.0 +24.0 +24.0 +24.0 +24.0 +24.0 +23.25 +23.0 +22.0 +21.0 +21.0 +21.0 +21.25 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +23.0 +22.5 +22.0 +20.5 +20.0 +20.0 +20.0 +20.0 +20.0 +20.0 +21.5 +23.0 +23.0 +23.0 +24.0 +24.0 +24.0 +24.0 +22.5 +21.25 +21.0 +22.0 +22.0 +22.0 +23.0 +23.0 +23.0 +24.4167 
+28.5 +17.3333 +21.0 +27.0 +27.75 +26.875 +30.5 +32.0 +31.25 +32.75 +30.75 +30.0 +30.25 +31.5 +33.25 +33.75 +37.25 +44.75 +51.0 +53.25 +52.535 +48.0 +42.5 +43.0 +40.25 +40.0 +37.5 +37.0 +37.25 +37.0 +37.75 +39.6425 +39.0 +40.0 +39.0 +39.0 +36.25 +33.5 +30.25 +33.5 +32.0 +30.5 +28.75 +27.5 +29.75 +29.25 +29.75 +30.0 +31.25 +32.0 +33.75 +36.25 +35.5 +35.5 +35.0 +37.0 +37.75 +40.0 +42.0 +37.75 +35.6667 +40.75 +43.0 +43.0 +43.0 +44.75 +53.75 +60.0 +60.0 +51.25 +50.0 +52.0 +51.0 +48.0 +48.0 +48.0 +48.0 +48.0 +42.25 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +41.0 +42.0 +42.0 +42.0 +42.0 +42.0 +42.0 +42.0 +42.0 +42.0 +43.25 +44.0 +44.0 +44.0 +44.0 +44.0 +44.0 +44.5 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +700.0 +775.0 +800.0 +800.0 +800.0 +800.0 +800.0 +800.0 +800.0 +800.0 +712.5 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +13.1667 +14.0 +13.5833 +18.75 +18.25 +23.875 +18.6146 +23.5 +23.0 +23.25 +41.25 +60.0 +60.0 +60.0 +60.0 +60.0 +54.25 +41.25 +40.5 +42.5 +42.75 +40.75 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +40.0 +39.5 +31.75 +32.0 +31.0 +30.0 +32.0 +32.5 +35.0 +35.0 +37.25 +41.75 +39.0 +38.0 +38.0 +38.5 +40.75 +41.0 +40.75 +41.0 +43.5 +43.0 +45.0 +45.0 +45.0 +45.0 +44.0 +44.0 +44.0 +45.5 +47.75 +45.75 +48.25 +53.0 +53.75 +54.0 +52.25 +49.5 +49.0 +49.0 +49.0 +49.0 +47.75 +47.5 +47.0 +47.0 +47.0 +47.0 +47.0 +45.25 +44.5 +40.0 +57.675 +56.9 +56.8 +57.175 +57.55 +57.8 +58.1 +57.675 +57.125 +57.575 +57.9 +59.55 +60.425 +60.375 +63.45 +63.9 +64.375 +63.0313 +67.7 +68.65 +68.4 +68.4875 +68.575 +68.875 +68.675 +67.3 +66.35 +65.825 +66.675 +66.7 +66.85 +67.375 +67.95 +67.625 +68.0 +68.15 +68.3 +68.575 +68.5375 +68.2125 +68.625 +69.3 +69.3625 +69.125 +69.175 +69.95 +70.9 +72.05 +72.625 +72.55 +75.15 +75.825 +75.975 +75.65 +75.4 +74.6625 +75.55 +77.375 +79.0875 +80.2875 +80.0125 +78.1875 +78.1875 +78.1875 +78.3875 +78.675 +77.125 +76.625 +76.3167 +76.1 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +200.0 +237.5 +250.0 +250.0 +250.0 +250.0 +250.0 +250.0 +250.0 +250.0 +250.0 +250.0 +250.0 +287.5 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +250.0 +250.0 +250.0 +262.5 +300.0 +300.0 +270.0 +270.0 +270.0 +270.0 +270.0 +270.0 +270.0 +270.0 +270.0 +270.0 +270.0 +292.5 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +287.5 +250.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +59.0 +65.0 +108.25 +106.0 +97.5 +95.0 +95.0 +95.0 +103.0 +119.0 +120.0 +120.0 +124.5 +131.5 +140.6 +144.0 +145.0 +28.25 +25.2 +24.0 +25.0 +25.0 +25.0 +26.2 +29.0 +29.0 +29.0 +29.75 +30.0 +30.0 +30.0 +30.0 +50.0 +50.0 +52.5 +55.0 +55.0 +53.75 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +45.0 +86.0 +87.6 +88.0 +88.0 +88.0 +88.0 +88.0 +88.0 +88.0 +88.0 +88.0 +84.5 +83.0 +83.0 +86.0 +37.25 +35.0 +34.0 +34.0 
+34.0 +34.0 +34.0 +34.0 +34.0 +34.5 +34.75 +34.0 +34.0 +34.0 +34.0 +11.5 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +13.0 +92.0 +92.0 +92.0 +92.0 +92.0 +92.0 +92.4 +94.0 +94.0 +94.0 +94.0 +94.0 +94.0 +94.0 +94.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +36.5 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +37.0 +40.0 +41.25 +45.0 +45.0 +46.6667 +55.0 +76.25 +77.0 +77.0 +77.0 +76.6667 +77.0 +77.0 +77.0 +77.0 +77.0 +77.25 +77.75 +77.6 +79.0 +80.25 +250.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +300.0 +34.0 +32.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +31.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +56.0 +90.0 +90.0 +90.0 +90.0 +90.0 +96.0 +95.4 +97.0 +97.75 +99.75 +142.25 +133.75 +123.4 +110.0 +111.25 +38.0 +37.8 +37.0 +37.0 +36.0 +33.75 +34.6 +36.0 +36.0 +36.0 +34.5 +33.0 +31.8 +31.0 +31.0 +47.0 +46.8 +46.0 +46.0 +48.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +50.0 +57.0 +56.4 +54.0 +54.0 +54.0 +54.75 +56.2 +56.0 +55.25 +55.0 +55.75 +63.75 +63.2 +60.0 +60.0 +83.0 +81.2 +82.0 +82.6 +88.3333 +90.0 +89.2 +91.0 +92.0 +94.5 +102.0 +102.0 +99.2 +88.0 +87.0 +38.0 +36.4 +36.0 +35.8 +34.3333 +33.5 +34.6 +34.6667 +34.0 +34.25 +34.75 +34.0 +34.0 +34.0 +34.25 +8.0 +8.0 +8.0 +8.0 +8.3333 +9.75 +10.0 +10.0 +10.5 +12.0 +12.0 +12.0 +12.0 +12.0 +12.0 +70.0 +70.0 +70.0 +70.0 +72.6667 +75.5 +76.4 +78.0 +78.0 +79.0 +80.5 +80.75 +82.0 +82.0 +82.75 +550.0 +580.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +600.0 +520.0 +500.0 +500.0 +38.0 +37.2 +36.5 +35.6 +34.6667 +34.75 +35.0 +34.0 +35.75 +42.0 +43.5 +42.0 +41.4 +42.0 +54.75 +76.0 +77.0 +77.0 +77.0 +77.0 +77.0 +77.0 +77.0 +77.0 +77.25 +77.5 +77.75 +78.2 +79.0 +80.25 +287.5 +270.0 +300.0 +300.0 +300.0 +300.0 +290.0 +250.0 +250.0 +250.0 +300.0 +350.0 +350.0 +316.6667 +300.0 +35.0 +34.2 +34.0 +34.0 +34.0 +32.75 +34.2 +33.6667 +33.0 +33.0 +33.0 +33.0 +32.2 +32.0 +32.0 +52.0 +52.0 +52.0 +52.8 +53.0 +53.0 +53.0 diff --git a/data/samples/gov10.csv b/data/samples/gov10.csv new file mode 100644 index 0000000..3a77a43 --- /dev/null +++ b/data/samples/gov10.csv @@ -0,0 +1,1024 @@ +4005.0 +921.0 +42004.0 +4005.0 +6408.0 +0.0 +0.0 +1805.6 +5064.86 +241297.18 +120027.66 +41434.5 +353231.36 +147491.76 +22074.5 +359278.13 +125516.59 +29107.94 +18446.71 +30863.56 +894000.05 +990203.96 +58550.16 +288417.04 +148053.48 +585111.92 +228888.07 +19714.38 +1044297.19 +1348673.99 +43254.34 +257924.38 +261726.4 +1000000.0 +0.0 +0.0 +-5772.85 +434982.24 +78752.8 +45874.08 +43689.6 +32767.2 +40710.6 +67100.6 +51690.0 +-4996.42 +68590.7 +597046.61 +356813.99 +0.0 +0.0 +0.0 +188464.87 +0.0 +395923.0 +0.0 +-107600.4 +1452168.0 +-298464.0 +298463.11 +-84952.0 +84951.7 +0.0 +11962.18 +13198.5 +18868.98 +3973.68 +0.0 +0.0 +29410.79 +55466.09 +-3814.56 +42431.5 +74282.46 +36569.9 +51316.5 +182728.8 +54535.46 +14259.0 +3764.74 +33709.5 +455751.31 +0.0 +4544391.68 +0.0 +416077.79 +0.0 +0.0 +31425.0 +20950.0 +10457.5 +10450.0 +466118.12 +3338691.58 +921043.0 +0.0 +0.0 +0.0 +2867433.34 +394704.95 +0.0 +0.0 +0.0 +0.0 +1476137.04 +0.0 +0.0 +0.0 +3156.0 +4896.44 +4326.8 +24213.0 +3117.8 +5380.83 +3598.8 +3500.0 +3658.0 +4018.0 +4858.0 +-1287.79 +12500.0 +1000.0 +10000.0 +0.0 +1225.0 +150.0 +5000.0 +0.0 +7500.0 +7000.0 +0.0 +0.0 +10000.0 +10000.0 +10000.0 +3500.0 +5000.0 +5000.0 +0.0 +0.0 +0.0 +0.0 +4000.0 +-525.0 +-3875.0 +-3775.0 +-4120.0 +-50.0 +-680.0 
+2300.0 +0.0 +38080.0 +3400.0 +360.0 +680.0 +18748.0 +-24210.8 +20160.0 +25500.0 +30000.0 +30000.0 +25375.0 +-7120.0 +-6291.04 +5950.0 +17000.0 +16000.0 +17000.0 +25000.0 +15000.0 +17000.0 +26200.0 +48000.0 +11000.0 +-1050.0 +45000.0 +45000.0 +24000.0 +-13125.75 +-5000.0 +-29174.0 +-4357.0 +-21640.0 +21640.0 +-3800.0 +0.0 +42500.0 +39480.0 +61315.0 +-1250.0 +-4700.0 +107525.0 +-3100.0 +91100.0 +30000.0 +-9835.0 +-22160.0 +-22870.0 +-9440.0 +-19814.0 +-4514.0 +-650.0 +-3725.0 +-1300.0 +-6760.0 +-7520.0 +-11040.0 +-13000.0 +-19208.0 +-27362.0 +-5650.0 +-3825.0 +-3740.0 +20764.08 +20133.03 +57760.37 +53251.06 +59584.1 +93000.0 +12963.0 +106902.0 +12963.0 +35932.0 +11873.0 +11873.0 +21197.0 +10783.0 +33979.0 +12963.0 +12963.0 +12418.0 +18689.0 +11173.0 +11873.0 +3872.0 +-884.2 +0.0 +18000.0 +20000.0 +644.36 +252.15 +7498.58 +3654.09 +1224.84 +2062.34 +3092.89 +915.84 +463.02 +0.0 +29330.0 +29330.0 +99900.0 +0.0 +0.0 +0.0 +6600.0 +0.0 +0.0 +0.0 +0.0 +20000.0 +7310.83 +5539.24 +24972.0 +8455.15 +0.0 +15.0 +3023.25 +3400.0 +3119.5 +4614.4 +8515.0 +6279.0 +3094.0 +2984.8 +10578.0 +2919.2 +7052.0 +0.0 +5987.9 +3849.85 +6164.64 +0.0 +0.0 +0.0 +0.0 +4377.45 +6862.8 +6470.67 +3855.6 +6085.46 +19926.88 +7068.6 +21186.58 +5622.48 +9692.68 +3044.07 +6898.8 +3224.0 +9936.0 +6624.0 +4283.0 +19500.0 +0.0 +3090.0 +7560.0 +8118.0 +8748.0 +3860.0 +14634.7 +7455.0 +16615.7 +14800.0 +8568.0 +24051.0 +12019.4 +9376.1 +19305.0 +6300.0 +22632.5 +3910.5 +10143.0 +3600.0 +14904.0 +22790.32 +9300.0 +12747.2 +9312.0 +24824.0 +24079.18 +6423.0 +39000.0 +-1505.0 +9240.0 +5984.0 +13690.0 +6132.0 +3660.0 +10872.0 +8907.96 +3192.0 +9664.0 +5520.0 +3659.28 +19962.0 +9702.0 +9579.96 +7782.27 +8493.8 +7331.44 +19650.8 +5774.14 +18567.75 +18423.78 +4049.15 +4049.15 +3232.75 +52500.0 +-81000.0 +23086.0 +87086.56 +0.0 +0.0 +54000.0 +46198.0 +56000.0 +7719.0 +40000.0 +15592.0 +68400.0 +0.0 +62550.0 +59625.0 +6306.4 +3153.2 +10440.0 +3360.0 +13920.0 +3640.0 +7800.0 +3552.0 +3552.0 +3552.0 +10800.0 +10800.0 +4320.0 +3800.0 +10400.0 +6720.0 +3400.0 +71923.25 +59261.5 +63000.0 +26906.5 +25800.0 +31350.0 +52500.0 +250.0 +28575.0 +0.0 +24562.78 +12428.0 +24610.56 +24856.0 +24610.56 +14913.6 +15381.6 +15000.0 +5925.0 +15000.0 +30000.0 +35000.0 +35000.0 +4500.0 +8000.0 +25000.0 +8000.0 +4500.0 +25000.0 +28000.0 +0.0 +7000.0 +7000.0 +10000.0 +7000.0 +20000.0 +3000.0 +3000.0 +270.0 +35000.0 +35000.0 +11390.32 +27949.0 +158725.89 +102339.0 +36000.0 +12307.68 +3401.0 +15395.28 +4718.48 +0.0 +6248.52 +12156.11 +6919.92 +5838.39 +0.0 +8908.0 +8330.0 +13362.0 +3833.46 +9649.2 +8963.25 +17753.1 +7405.2 +10674.3 +6925.8 +4131.0 +6171.0 +3478.2 +3234.0 +0.0 +7374.6 +17166.6 +2318.8 +0.0 +3977.71 +22223.25 +22298.9 +5854.8 +11832.0 +-1642.2 +7068.6 +7475.75 +4896.0 +20652.0 +30736.48 +29443.0 +4850.08 +159716.32 +10808.0 +299970.0 +68856.0 +45356.0 +12447.35 +42929.85 +19256.92 +21059.96 +0.0 +3569.32 +5388.0 +0.0 +0.0 +4057.88 +4887.36 +2749.92 +0.0 +3239.76 +4633.6 +0.0 +4621.48 +6930.34 +8916.4 +8688.1 +4902.52 +5388.0 +12209.36 +23866.79 +3965.76 +3542.88 +8575.84 +11735.36 +3124.8 +11058.32 +11247.8 +6000.0 +3932.4 +12380.14 +0.0 +5000.0 +5000.0 +5000.0 +-67.6 +-608.4 +2590.0 +2590.0 +2590.0 +1056.0 +1540.0 +1408.0 +1840.0 +2355.0 +2082.0 +2235.0 +2011.39 +2595.0 +3600.0 +3600.0 +3600.0 +1413333.33 +3214292.74 +3430000.0 +4233000.0 +3278088.79 +0.0 +6624370.14 +3472.0 +7332.11 +4758.87 +3503.24 +3375.0 +19535.72 +1062.1 +982.73 +2160.97 +1155.71 +2101.33 +1337.49 +339.98 +765.53 +480.31 +31.94 +688.3 +242.74 +979.44 +1280.15 +1946.68 +2344.03 
+25460.41 +758.69 +2644.76 +3259.8 +2828.69 +1941.07 +135.0 +65.16 +1285.04 +20.89 +1083.41 +1280.55 +3111.12 +23.07 +752.57 +1035.05 +1297.34 +834.36 +2863.31 +205.7 +8.52 +5000.0 +4000.0 +4000.0 +4000.0 +4000.0 +3500.0 +4000.0 +4000.0 +4000.0 +5000.0 +2000.0 +5000.0 +4000.0 +5000.0 +3500.0 +3500.0 +4000.0 +4000.0 +24300.0 +163027.34 +24300.0 +50795.66 +14477.48 +2450.0 +2250.0 +6600.0 +2200.0 +15000.0 +4000.0 +8000.0 +8000.0 +2500.0 +2450.0 +2450.0 +16000.0 +20000.0 +4500.0 +4500.0 +16500.0 +18000.0 +129894.63 +2800.0 +18000.0 +4000.0 +4000.0 +4500.0 +500.0 +2250.0 +2250.0 +4000.0 +16000.0 +2450.0 +2800.0 +3000.0 +726.0 +726.0 +17551.24 +18726.55 +2200.0 +4500.0 +2345.38 +208633.36 +726.0 +10500.0 +726.0 +4000.0 +4000.0 +726.0 +726.0 +5500.0 +20000.0 +6500.0 +2000.0 +2000.0 +500.0 +17000.0 +5500.0 +3500.0 +39086.85 +39086.85 +39086.85 +39086.85 +460000.0 +108000.0 +360000.0 +500.0 +500.0 +500.0 +500.0 +500.0 +500.0 +500.0 +500.0 +800.0 +500.0 +285.0 +26000.0 +75000.0 +889.05 +12000.0 +5080.0 +14092.82 +26500.0 +60000.0 +26500.0 +80000.0 +80.65 +1000.0 +2416.1 +9938.54 +16404.88 +43720.72 +600.0 +75000.0 +125000.0 +75000.0 +2062.5 +3052.5 +6000.0 +4600.0 +2500.0 +1200.0 +4000.0 +1500.0 +2000.0 +4000.0 +4100.0 +4100.0 +4100.0 +4100.0 +2000.0 +3000.0 +3000.0 +2000.0 +2000.0 +3000.0 +4400.0 +2000.0 +5000.0 +3000.0 +2000.0 +4000.0 +5000.0 +6000.0 +900.0 +200.0 +250.0 +550.0 +1000.0 +200.0 +200.0 +1200.0 +1000.0 +2000.0 +1500.0 +2537.06 +2000.0 +1000.0 +3000.0 +2300.0 +4500.0 +21000.0 +2500.0 +4000.0 +4400.0 +4100.0 +10000.0 +3000.0 +2000.0 +12000.0 +425.0 +1500.0 +1000.0 +300.0 +800.0 +1500.0 +1462.94 +300.0 +500.0 +1000.0 +200.0 +1400.0 +400.0 +4400.0 +4100.0 +4600.0 +300.0 +4600.0 +5000.0 +4000.0 +10000.0 +500.0 +5000.0 +3000.0 +13000.0 +1000.0 +1500.0 +500.0 +450.0 +1000.0 +1000.0 +1500.0 +4000.0 +2500.0 +1200.0 +1200.0 +1200.0 +1200.0 +4100.0 +4100.0 +4400.0 +4100.0 +4100.0 +4100.0 +2100.0 +2100.0 +5000.0 +2000.0 +1500.0 +500.0 +2000.0 +1000.0 +1000.0 +1500.0 +250.0 +250.0 +125.26 +30000.0 +2800.0 +180000.0 +6190.0 +6240.0 +9059.0 +44646.1 +300500.0 +1498212.51 +179.48 +125074.0 +-125074.0 +-11016.5 +-50697.97 +-50697.97 +44592.0 +288038.33 +567556.76 +-12156.0 +-26049.85 +-50500.0 +35298.1 +257566.81 +654414.51 +673.09 +-20548.09 +-7671.41 +-32915.75 +20000.0 +15000.0 +9500.0 +12000.0 +40000.0 +35424.41 +226.6 +-12660.37 +31925.34 +4111.67 +17659.7 +2295.8 +2885.48 +9500.0 +1298.33 +20000.0 +15000.0 +4560.72 +12000.0 +-10187.12 +40000.0 +23585.0 +15696.17 +14091.67 +-30528.84 +-4111.67 +-1469.93 +26825.49 +558.7 +413.2 +2148.3 +75.9 +-12455.25 +10000.0 +1508.65 +-2660.32 +-21724.99 +20000.0 +15000.0 +9500.0 +12000.0 +40000.0 +2836.98 +-23585.0 +-11953.27 +24786.14 +23528.3 +11300.0 +3410.9 +8254.6 +-6299.92 +20000.0 +15000.0 +169.19 +9500.0 +12000.0 +1169.6 +40000.0 +3244.54 +-10846.49 +-4733.29 +6068.25 +-15000.0 +91.65 +7556.3 +-6900.2 +-6967.14 +-4767.32 +-10094.98 +-18379.15 +50.35 +1290.59 +1133.4 +-387.0 +15.4 +-347.1 +16.91 +-2924.36 +-5066.89 +-8000.0 +-5501.88 +-1887.51 +-4500.0 +-1000.0 +-8000.0 +-11000.0 +-8000.0 +48999.0 +55050.0 +91060.0 +10000.0 +30000.0 +6465.0 +6864.56 +14.65 +8000.0 +4000.0 +4000.0 +4000.0 +5000.0 +8000.0 +8000.0 +10000.0 +8000.0 +8000.0 +12000.0 +8000.0 +12000.0 +5000.0 +2397.8 +3100.0 +8000.0 +10000.0 +5000.0 +12000.0 +34775.0 +21073.2 +3000.0 +8000.0 +20000.0 +5000.0 +8070.0 +203.0 +5237.6 +4000.0 +11090.0 +5000.0 +50000.0 +20000.0 +20000.0 +-20000.0 +2500.0 +3000.0 +45000.0 +45000.0 +500000.0 +209595.27 +15000.0 +15000.0 +0.0 +45000.0 +500000.0 
+15000.0 +50000.0 +115.68 +617.69 +10000.0 +30000.0 +95295.28 +2000.0 +14588.68 +-10000.0 +400000.0 +180000.0 +4033.25 +4731.95 +3932.32 +17397.66 +10707.58 +4618.9 +6148.9 +4323.1 +7301.5 +81925.66 +63561.21 +0.0 +3000.0 +29367.5 +669.74 +1978.83 +243.1 +271.38 +2625.66 +0.0 +448.68 +1058.0 +2040.0 +13500.0 +90.0 +116000.0 +1021.51 diff --git a/data/samples/gov26.csv b/data/samples/gov26.csv new file mode 100644 index 0000000..d5bdc3b --- /dev/null +++ b/data/samples/gov26.csv @@ -0,0 +1,1024 @@ +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 diff --git a/data/samples/gov30.csv b/data/samples/gov30.csv new file mode 100644 index 0000000..f81d529 --- /dev/null +++ b/data/samples/gov30.csv @@ -0,0 +1,1024 @@ +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +60.4 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +288.75 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +37.5 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +53751.6 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 diff --git a/data/samples/gov31.csv b/data/samples/gov31.csv new file mode 100644 index 0000000..866f4f8 --- /dev/null +++ b/data/samples/gov31.csv @@ -0,0 +1,1024 @@ +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +18473.76 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 diff --git a/data/samples/gov40.csv b/data/samples/gov40.csv new file mode 100644 index 0000000..c802930 --- /dev/null +++ b/data/samples/gov40.csv @@ -0,0 +1,1024 @@ +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +9600.0 +125712.0 +324000.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 
+0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 diff --git a/data/samples/medicare1.csv b/data/samples/medicare1.csv new file mode 100644 index 0000000..5f8b828 --- /dev/null +++ b/data/samples/medicare1.csv @@ -0,0 +1,1024 @@ +35.389090909 +4.21 +4.39 +48.9498 +52.270229885 +65.119444444 +70.5 +47.96 +49.39 +76.12 +72.33 +74.78 +9.1475 +43.38 +173.318 +48.08 +40.29 +39.670307692 +40.29 +9.81 +9.81 +9.81 +9.81 +3.05 +3.05 +3.05 +3.05 +3.05 +3.05 +2.8 +3.05 +3.05 +3.05 +3.05 +3.05 +3.05 +57.21 +67.36 +68.3 +58.433529412 +60.782783505 +91.5045 +86.848181818 +88.611232877 +90.638787879 +86.325 +92.08 +91.107323944 +92.08 +89.031689498 +92.08 +43.665 +43.056 +38.420833333 +46.304 +45.801428571 +43.914545455 +46.553846154 +44.596666667 +45.396470588 +46.304 +44.252631579 +46.146111111 +46.13 +39.425882353 +40.168888889 +45.116586103 +44.1 +59.645 +53.6725 +55.805322581 +45.887468354 +51.1775 +54.466363636 +50.642857143 +55.934210526 +51.448695652 +50.61 +50.121923077 +50.223529412 +48.44859375 +50.928 +48.168823529 +48.743297872 +49.510222222 +38.59 +36.703858268 +38.1434 +37.085174129 +37.593192771 +37.887307692 +36.718823529 +37.876756757 +36.296851852 +38.511923077 +37.501626506 +39.492222222 +37.179767141 +38.277692308 +38.235771812 +43.685217391 +44.02 +83.62 +83.24 +83.62 +83.221904762 +83.268739496 +8.34 +8.34 +8.8986956522 +8.6907692308 +8.34 +9.6 +9.6 +8.2689090909 +8.96 +8.96 +8.69 +9.21 +9.21 +8.69 +8.69 +9.1291666667 +8.96 +8.4485714286 +8.96 +8.96 +8.69 +9.3625 +9.28 +9.17 +9.17 +8.69 +8.96 +8.69 +8.69 +8.69 +8.69 +8.69 +8.96 +8.69 +8.69 +8.49 +8.69 +9.61 +8.69 +8.96 +8.69 +8.96 +8.69 +11.748387097 +11.215247525 +11.93 +11.09 +12.33 +12.33 +10.722743363 +11.59 +11.59 +11.2 +11.766037736 +11.704 +11.2 +11.2 +11.76 +11.2 +11.59 +11.59 +11.59 +11.59 +11.2 +11.59 +11.899333333 +11.93 +11.88 +11.88 +11.59 +11.2 +12.29 +11.2 +11.2 +11.079102564 +11.2 +11.2 +11.2 +11.2 +11.2 +11.59 +11.2 +11.2 +11.12 +11.159527897 +12.35 +11.2 +11.59 +11.2 +11.59 +11.59 +11.59 +11.93 +11.914615385 +11.59 +11.2 +15.71 +14.35 +14.35 +14.35 +14.35 +11.646363636 +12.93 +14.14 +14.14 +13.68 +13.68 +13.68 +13.68 +83.266901408 +79.63 +82.784 +79.98 +83.62 +83.62 +83.216135266 +83.62 +83.225660377 +82.882647059 +84.482631579 +83.62 +85.884666667 +87.34 +85.92004386 +86.727943925 +86.914307692 +86.191052632 +83.25 +83.636696429 +83.382692308 +86.456705202 +86.820238095 +87.023623188 +86.708263666 +87.34 +86.23 +86.466121212 +86.628333333 +86.483921569 +96.82 +89.541666667 +96.82 +92.13 +93.083529412 +96.82 +96.82 +98.77 +94.62 +105.64 +105.64 +105.64 +83.96 +80.309565217 +53.724 +54.82 +51.7845 +53.45 +52.712307692 +51.395 +54.272 +53.875172414 +38.981333333 +36.462777778 +42.617142857 +42.473636364 +38.7825 +46.366571429 +36.652142857 +38.553076923 +37.055789474 +50.59 +60.418571429 +68.2175 +80.730769231 +67.634444444 +43.812857143 +48.392978723 +47.426346154 +49.163235294 +44.79 +49.949090909 +43.751016949 +47.326296296 +47.096571429 +47.879036145 +47.337045455 +51.79 +50.927333333 +55.704545455 +42.97 +42.97 +42.97 +42.97 +34.51 +34.51 +34.51 +25.71 +25.39 +25.39 +24.19 +25.39 +24.685135135 +25.39 +25.39 +12.86 +12.86 +12.86 +12.33 +12.86 +12.507735849 +12.742666667 +12.86 +12.86 +12.948 +12.86 +13.01 +12.86 +15.23 +15.23 +15.23 +15.215555556 +14.63 +15.23 +15.23 +15.23 +15.23 +15.23 +36.54 +43.98 +56.55 +7.79 +7.79 +7.79 +7.79 +16.73 +17.61 +17.61 +39.9352 +33.52 
+56.522857143 +56.3996 +55.995405405 +46.565652174 +65.67173913 +70.092777778 +68.89030303 +67.78 +68.213870968 +68.397735849 +68.989411765 +68.399230769 +107.99 +107.99 +102.014375 +94.495 +65.298947368 +78.21 +78.21 +113.41 +111.22923077 +113.41 +107.94 +111.58096774 +111.91789474 +112.98045455 +107.99 +64.968333333 +66.255833333 +60.165 +66.503529412 +67.85 +66.136153846 +68.27 +71.0325 +70.988529412 +106.49277778 +116.76307692 +107.99 +104.85285714 +103.08272727 +73.682727273 +77.19 +72.07 +73.332 +75.261 +55.89 +46.04 +52.44 +52.44 +52.44 +76.14 +76.14 +76.14 +76.14 +117.57 +117.57 +117.57 +118.61 +118.61 +62.66 +61.011052632 +77.54 +26.74 +33.85 +33.85 +34.51 +34.51 +32.83 +34.51 +34.51 +34.51 +34.51 +34.51 +37.58 +32.5 +34.356 +33.172 +35.030277778 +33.89 +35.032571429 +34.553333333 +13.494897959 +14.14 +13.68 +14.939193548 +14.14 +9.6333333333 +9.43 +9.42 +8.91 +9.79 +9.9116666667 +9.7545454545 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +10.43 +9.79 +9.8657142857 +9.43 +9.42 +10.41 +10.41 +9.3438461538 +9.79 +9.79 +9.79 +9.94 +9.9108571429 +9.43 +9.43 +9.79 +9.79 +9.79 +9.79 +9.79 +10.010444444 +10.06 +10.014705882 +9.79 +9.43 +10.55 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +9.43 +9.79 +9.43 +9.43 +9.42 +9.43 +10.43 +9.43 +9.79 +9.43 +9.79 +9.79 +10.06 +9.79 +9.43 +10.059607843 +9.78 +9.76 +10.579807692 +10.78 +9.3027102804 +10.14 +10.14 +10.3 +9.78 +9.78 +10.271111111 +9.78 +10.14 +10.14 +10.14 +10.14 +9.78 +10.273714286 +10.42 +10.4 +9.78 +10.89 +9.78 +9.78 +9.78 +9.78 +9.78 +10.14 +9.78 +9.78 +9.78 +10.79 +9.78 +10.76 +10.14 +10.42 +10.14 +11.91 +12.71 +11.74 +12.34 +11.91 +11.91 +12.34 +12.34 +12.771578947 +12.65 +11.91 +12.98 +11.91 +11.91 +11.91 +11.91 +11.91 +11.81 +11.91 +13.18 +11.91 +12.34 +12.71 +11.91 +11.8764 +11.554174757 +12.29 +11.43 +12.7 +12.7 +11.142765957 +11.94 +11.94 +11.55 +12.135714286 +12.044117647 +11.55 +11.55 +12.109583333 +11.55 +11.94 +11.94 +11.94 +11.94 +11.55 +11.94 +12.02245614 +12.29 +12.23 +11.211666667 +11.94 +11.55 +12.63 +11.55 +11.55 +11.55 +11.55 +11.402051282 +11.55 +11.55 +11.565744681 +11.94 +11.55 +11.55 +11.55 +11.46 +11.55 +12.72 +11.55 +12.72 +12.68 +11.94 +11.55 +11.94 +12.29 +12.29 +11.55 +16.359411765 +15.79 +15.42 +14.983786408 +16.29 +16.29 +15.79 +16.65 +16.65 +15.79 +16.592666667 +16.29 +16.29 +16.29 +15.79 +16.86375 +15.79 +16.81 +15.79 +15.79 +16.29 +15.79 +15.79 +16.29 +16.29 +16.29 +15.79 +8.93 +8.69 +8.34 +8.260625 +8.96 +9.21 +8.69 +8.69 +8.69 +8.96 +8.96 +8.96 +8.69 +9.28 +8.69 +8.96 +8.69 +8.69 +8.69 +8.69 +8.69 +8.69 +8.96 +8.4833333333 +8.69 +9.61 +8.69 +8.69 +11.43 +11.55 +10.66 +10.638636364 +11.84 +11.17 +11.17 +11.17 +11.4 +11.17 +11.17 +11.17 +11.51 +11.17 +12.36 +10.8 +11.11 +10.8 +10.8 +10.8 +10.8 +12.59 +9.3424324324 +9.0756976744 +9.63 +9.1 +9.95 +9.95 +8.6417241379 +9.4 +9.4 +9.07 +9.5039285714 +9.4726315789 +9.07 +9.07 +9.52 +9.07 +9.4 +9.4 +9.4 +9.4 +9.4 +9.07 +9.4 +9.5767857143 +9.63 +9.63 +9.63 +9.94 +9.4 +9.07 +10.2 +9.07 +9.07 +9.07 +9.07 +9.07 +9.07 +9.07 +9.07 +9.07 +9.07 +9.4 +9.07 +9.07 +9.07 +9.07 +9.97 +9.07 +9.4 +9.07 +9.94 +9.4 +9.4 +9.63 +9.63 +9.07 +8.21 +8.69 +11.09 +11.2 +11.2 +11.2 +12.35 +11.17 +10.636666667 +11.51 +11.84 +11.84 +11.84 +11.51 +11.17 +11.17 +11.51 +12.614375 +13.28 +13.66 +13.28 +13.28 +14.16 +7.86 +7.63 +7.63 +7.63 +10.161538462 +9.7847524752 +10.42 +9.76 +10.78 +10.78 +9.3389189189 +10.14 +10.14 +9.78 +10.287 +10.192941176 +9.78 +9.78 +10.3 +9.78 +10.14 +10.14 +10.14 +10.14 +10.14 +9.78 +10.14 +10.433272727 +10.42 +10.4 +10.335 +10.14 +9.78 +10.89 +9.78 
+9.5680555556 +9.78 +9.78 +9.78 +9.78 +9.78 +10.14 +9.78 +9.78 +9.6677941176 +10.79 +9.78 +10.76 +10.14 +9.78 +10.14 +10.14 +10.42 +10.42 +10.14 +9.78 +13.0 +22.0 +23.21 +23.9 +8.57 +7.35 +8.41 +8.0312195122 +7.63 +7.63 +7.63 +7.63 +8.42 +7.63 +11.912666667 +11.55 +11.43 +12.7 +11.94 +11.94 +12.15 +11.55 +11.55 +11.94 +11.94 +11.94 +11.55 +12.63 +11.55 +11.55 +11.55 +11.55 +11.602857143 +11.55 +11.94 +11.55 +11.55 +12.72 +11.94 +11.55 +11.55 +16.47 +9.385 +9.07 +9.1 +8.74 +9.4 +9.4 +9.4 +9.07 +9.07 +9.07 +9.4 +9.4 +9.4 +9.7442857143 +9.63 +9.2773333333 +9.07 +10.2 +9.07 +9.07 +9.07 +8.8789473684 +9.07 +9.4 +9.07 +9.07 +9.07 +9.97 +9.1314285714 +9.07 +9.63 +9.07 +6.25 +8.69 +8.34 +8.264 +8.69 +8.69 +8.69 +8.69 +8.96 +8.69 +8.69 +8.69 +9.61 +32.23 +34.01 +35.01 +16.35 +15.79 +14.9628 +14.992181818 +16.29 +16.65 +16.527142857 +15.79 +16.65 +15.79 +16.424285714 +16.81 +15.79 +16.29 +15.79 +15.79 +17.39 +15.79 +18.27 +9.1145 +9.09 +9.09 +9.09 +9.09 +9.09 +8.93 +8.6932432432 +8.34 +9.6 +9.6 +8.2686956522 +8.96 +8.96 +8.69 +9.1753333333 +9.1488235294 +8.69 +8.69 +8.96 +8.96 +8.96 +8.96 +8.69 +9.2562962963 +9.28 +9.17 +9.1639130435 +8.69 +8.96 +8.69 +8.69 +8.69 +8.69 +8.69 +8.96 +8.69 +8.69 +8.49 +8.69 +9.61 +8.69 +8.96 +8.96 +9.28 +9.28 +8.69 +26.75 +26.75 +27.54 +26.75 +26.528571429 +33.83 +34.79 +34.79 +46.12 +34.673265306 +38.29 +86.263068182 +116.8 +114.26 +110.9 +114.93 +110.9 +117.86 +114.26 +220.94142857 +234.51 +227.29 +230.92703704 +220.69 +220.69 +227.29 +222.73740741 +234.51 +227.29 +234.51 +220.69 +220.69 +220.69 +231.82121212 +234.51 +227.29 +220.69 +227.29 +241.82 +227.29 +220.69 +220.69 +220.69 diff --git a/data/samples/medicare9.csv b/data/samples/medicare9.csv new file mode 100644 index 0000000..11110db --- /dev/null +++ b/data/samples/medicare9.csv @@ -0,0 +1,1024 @@ +77.0 +232.0 +222.0 +50.0 +87.0 +18.0 +33.0 +104.0 +29.0 +190.0 +18.0 +18.0 +36.0 +13.0 +20.0 +18.0 +14.0 +65.0 +69.0 +24.0 +307.0 +456.0 +24.0 +12.0 +241.0 +13.0 +58.0 +136.0 +666.0 +22.0 +24.0 +23.0 +47.0 +1385.0 +16.0 +28.0 +16.0 +12.0 +143.0 +17.0 +97.0 +40.0 +22.0 +73.0 +33.0 +12.0 +47.0 +71.0 +54.0 +219.0 +25.0 +24.0 +15.0 +72.0 +60.0 +21.0 +44.0 +39.0 +12.0 +51.0 +30.0 +19.0 +108.0 +14.0 +17.0 +27.0 +331.0 +21.0 +14.0 +12.0 +62.0 +79.0 +24.0 +33.0 +21.0 +38.0 +23.0 +26.0 +26.0 +17.0 +64.0 +55.0 +51.0 +94.0 +1080.0 +15.0 +127.0 +250.0 +201.0 +166.0 +26.0 +170.0 +37.0 +216.0 +260.0 +166.0 +18.0 +773.0 +26.0 +298.0 +23.0 +22.0 +12.0 +55.0 +71.0 +210.0 +119.0 +46.0 +71.0 +23.0 +39.0 +35.0 +23.0 +17.0 +55.0 +40.0 +12.0 +13.0 +26.0 +25.0 +73.0 +27.0 +12.0 +22.0 +14.0 +18.0 +24.0 +34.0 +16.0 +20.0 +14.0 +17.0 +27.0 +38.0 +74.0 +25.0 +13.0 +13.0 +45.0 +59.0 +12.0 +31.0 +12.0 +72.0 +78.0 +21.0 +15.0 +12.0 +19.0 +29.0 +31.0 +202.0 +45.0 +174.0 +65.0 +35.0 +113.0 +32.0 +49.0 +47.0 +53.0 +50.0 +165.0 +33.0 +39.0 +12.0 +73.0 +29.0 +48.0 +50.0 +57.0 +31.0 +60.0 +25.0 +19.0 +59.0 +31.0 +97.0 +95.0 +86.0 +16.0 +78.0 +62.0 +47.0 +35.0 +236.0 +23.0 +76.0 +32.0 +52.0 +34.0 +233.0 +339.0 +51.0 +29.0 +25.0 +51.0 +33.0 +13.0 +32.0 +39.0 +27.0 +73.0 +13.0 +14.0 +17.0 +15.0 +15.0 +11.0 +13.0 +11.0 +17.0 +31.0 +12.0 +40.0 +12.0 +71.0 +62.0 +50.0 +53.0 +84.0 +38.0 +207.0 +17.0 +53.0 +68.0 +57.0 +48.0 +45.0 +63.0 +228.0 +107.0 +65.0 +19.0 +13.0 +112.0 +104.0 +173.0 +84.0 +69.0 +311.0 +18.0 +118.0 +165.0 +702.0 +51.0 +17.0 +12.0 +44.0 +37.0 +17.0 +12.0 +13.0 +38.0 +11.0 +14.0 +15.0 +27.0 +18.0 +23.0 +25.0 +13.0 +20.0 +20.0 +13.0 +12.0 +25.0 +29.0 +15.0 +18.0 +14.0 +11.0 +36.0 +35.0 +14.0 +13.0 +19.0 +235.0 +14.0 +20.0 +13.0 +27.0 +21.0 
+47.0 +52.0 +34.0 +18.0 +11.0 +59.0 +27.0 +35.0 +166.0 +44.0 +16.0 +15.0 +11.0 +11.0 +131.0 +12.0 +91.0 +22.0 +110.0 +288.0 +16.0 +17.0 +15.0 +14.0 +15.0 +37.0 +131.0 +24.0 +14.0 +48.0 +14.0 +13.0 +21.0 +53.0 +75.0 +29.0 +20.0 +20.0 +19.0 +11.0 +12.0 +12.0 +30.0 +23.0 +36.0 +43.0 +19.0 +30.0 +23.0 +15.0 +16.0 +35.0 +28.0 +12.0 +29.0 +22.0 +21.0 +24.0 +15.0 +12.0 +18.0 +25.0 +16.0 +14.0 +25.0 +37.0 +23.0 +46.0 +36.0 +33.0 +19.0 +31.0 +53.0 +68.0 +13.0 +50.0 +33.0 +48.0 +84.0 +19.0 +14.0 +37.0 +15.0 +13.0 +17.0 +17.0 +31.0 +38.0 +66.0 +11.0 +12.0 +24.0 +12.0 +17.0 +28.0 +26.0 +46.0 +24.0 +68.0 +18.0 +13.0 +37.0 +21.0 +44.0 +11.0 +19.0 +14.0 +20.0 +20.0 +22.0 +34.0 +19.0 +33.0 +64.0 +16.0 +31.0 +33.0 +40.0 +12.0 +276.0 +70.0 +27.0 +19.0 +12.0 +19.0 +15.0 +19.0 +58.0 +50.0 +155.0 +666.0 +22.0 +24.0 +23.0 +48.0 +1499.0 +28.0 +13.0 +19.0 +15.0 +15.0 +36.0 +16.0 +35.0 +27.0 +49.0 +20.0 +41.0 +62.0 +17.0 +18.0 +25.0 +47.0 +21.0 +51.0 +18.0 +11.0 +13.0 +23.0 +16.0 +87.0 +12.0 +12.0 +11.0 +11.0 +12.0 +26.0 +15.0 +11.0 +21.0 +95.0 +123.0 +70.0 +33.0 +26.0 +12.0 +23.0 +22.0 +42.0 +35.0 +20.0 +28.0 +22.0 +29.0 +24.0 +18.0 +26.0 +45.0 +26.0 +34.0 +35.0 +63.0 +80.0 +60.0 +31.0 +131.0 +20.0 +15.0 +226.0 +19.0 +24.0 +116.0 +15.0 +12.0 +15.0 +196.0 +187.0 +16.0 +21.0 +13.0 +45.0 +38.0 +17.0 +12.0 +13.0 +51.0 +99.0 +56.0 +52.0 +19.0 +107.0 +73.0 +31.0 +20.0 +15.0 +46.0 +27.0 +14.0 +50.0 +19.0 +48.0 +27.0 +73.0 +35.0 +31.0 +18.0 +65.0 +106.0 +39.0 +18.0 +29.0 +124.0 +31.0 +25.0 +21.0 +18.0 +114.0 +228.0 +15.0 +16.0 +28.0 +17.0 +23.0 +32.0 +29.0 +76.0 +15.0 +24.0 +200.0 +34.0 +31.0 +19.0 +25.0 +18.0 +31.0 +113.0 +12.0 +72.0 +12.0 +22.0 +22.0 +21.0 +258.0 +33.0 +12.0 +29.0 +52.0 +25.0 +103.0 +23.0 +98.0 +64.0 +22.0 +47.0 +57.0 +24.0 +40.0 +42.0 +34.0 +53.0 +48.0 +24.0 +28.0 +37.0 +26.0 +34.0 +23.0 +32.0 +15.0 +57.0 +17.0 +12.0 +12.0 +23.0 +43.0 +75.0 +20.0 +14.0 +63.0 +79.0 +39.0 +64.0 +36.0 +47.0 +24.0 +17.0 +35.0 +40.0 +17.0 +188.0 +265.0 +24.0 +11.0 +19.0 +23.0 +27.0 +13.0 +37.0 +31.0 +50.0 +17.0 +32.0 +51.0 +103.0 +35.0 +11.0 +12.0 +22.0 +17.0 +129.0 +15.0 +25.0 +17.0 +15.0 +33.0 +16.0 +21.0 +11.0 +142.0 +21.0 +90.0 +19.0 +30.0 +11.0 +15.0 +21.0 +13.0 +16.0 +20.0 +48.0 +32.0 +14.0 +15.0 +24.0 +15.0 +17.0 +13.0 +13.0 +13.0 +16.0 +19.0 +23.0 +50.0 +14.0 +30.0 +20.0 +26.0 +13.0 +69.0 +19.0 +21.0 +44.0 +28.0 +12.0 +16.0 +12.0 +14.0 +14.0 +22.0 +13.0 +24.0 +11.0 +12.0 +22.0 +25.0 +12.0 +22.0 +29.0 +58.0 +17.0 +12.0 +17.0 +25.0 +18.0 +39.0 +13.0 +11.0 +37.0 +86.0 +43.0 +178.0 +40.0 +23.0 +87.0 +91.0 +30.0 +25.0 +56.0 +38.0 +181.0 +26.0 +30.0 +29.0 +66.0 +24.0 +42.0 +61.0 +16.0 +20.0 +35.0 +56.0 +13.0 +29.0 +53.0 +12.0 +25.0 +34.0 +101.0 +73.0 +12.0 +54.0 +104.0 +28.0 +129.0 +11.0 +18.0 +19.0 +13.0 +24.0 +11.0 +18.0 +29.0 +162.0 +275.0 +32.0 +32.0 +40.0 +15.0 +13.0 +12.0 +52.0 +40.0 +30.0 +12.0 +21.0 +15.0 +14.0 +24.0 +16.0 +14.0 +25.0 +15.0 +18.0 +16.0 +13.0 +14.0 +44.0 +12.0 +11.0 +21.0 +32.0 +35.0 +17.0 +20.0 +16.0 +14.0 +16.0 +11.0 +51.0 +24.0 +39.0 +101.0 +21.0 +52.0 +46.0 +14.0 +74.0 +67.0 +30.0 +27.0 +40.0 +34.0 +162.0 +37.0 +24.0 +19.0 +48.0 +19.0 +41.0 +34.0 +13.0 +59.0 +31.0 +55.0 +26.0 +16.0 +24.0 +24.0 +70.0 +79.0 +66.0 +36.0 +22.0 +32.0 +23.0 +104.0 +18.0 +51.0 +23.0 +21.0 +136.0 +307.0 +37.0 +19.0 +22.0 +14.0 +24.0 +14.0 +27.0 +26.0 +18.0 +62.0 +30.0 +17.0 +36.0 +15.0 +14.0 +51.0 +12.0 +41.0 +24.0 +40.0 +48.0 +49.0 +12.0 +13.0 +15.0 +20.0 +41.0 +15.0 +20.0 +17.0 +22.0 +25.0 +12.0 +11.0 +12.0 +12.0 +20.0 +20.0 +13.0 +26.0 +40.0 +16.0 +14.0 +12.0 +12.0 +12.0 +87.0 +104.0 +12.0 +12.0 +18.0 +281.0 +16.0 
+31.0 +83.0 +24.0 +14.0 +13.0 +19.0 +19.0 +25.0 +18.0 +19.0 +15.0 +11.0 +14.0 +14.0 +15.0 +21.0 +39.0 +13.0 +29.0 +17.0 +19.0 +107.0 +21.0 +13.0 +12.0 +57.0 +50.0 +14.0 +12.0 +12.0 +21.0 +11.0 +14.0 +35.0 +20.0 +15.0 +14.0 +11.0 +26.0 +26.0 +13.0 +27.0 +24.0 +19.0 +19.0 +11.0 +13.0 +11.0 +31.0 +25.0 +55.0 +58.0 +11.0 +14.0 +70.0 +16.0 +19.0 +14.0 +12.0 +69.0 +35.0 +13.0 +51.0 +13.0 +11.0 +15.0 +20.0 +21.0 +15.0 +32.0 +20.0 +36.0 +16.0 +37.0 +34.0 +17.0 +22.0 +46.0 +33.0 +14.0 +15.0 +15.0 +17.0 +60.0 +18.0 +24.0 +19.0 +19.0 +23.0 +29.0 +27.0 +17.0 +14.0 +23.0 +29.0 +50.0 +79.0 +20.0 +12.0 +55.0 +11.0 +27.0 +18.0 +23.0 +15.0 +72.0 +102.0 +14.0 +13.0 +13.0 +11.0 +17.0 +23.0 +35.0 +12.0 +16.0 +12.0 +14.0 +13.0 +11.0 +18.0 +25.0 +49.0 +40.0 +88.0 +30.0 +53.0 +22.0 +24.0 +13.0 +21.0 +23.0 +21.0 +55.0 +13.0 +27.0 +60.0 +19.0 +14.0 +27.0 +11.0 +150.0 +77.0 +24.0 +72.0 +45.0 +66.0 +23.0 +41.0 +44.0 +24.0 +71.0 +40.0 +56.0 +29.0 +11.0 diff --git a/data/samples/neon_air_pressure.csv b/data/samples/neon_air_pressure.csv new file mode 100644 index 0000000..3e3a243 --- /dev/null +++ b/data/samples/neon_air_pressure.csv @@ -0,0 +1,1024 @@ +97.85764 +97.85864 +97.85947 +97.8598 +97.86063 +97.8603 +97.86063 +97.8608 +97.8603 +97.8578 +97.85664 +97.85714 +97.85897 +97.8613 +97.8638 +97.86647 +97.86847 +97.86864 +97.86914 +97.86964 +97.8708 +97.8723 +97.8718 +97.87064 +97.86964 +97.87014 +97.8688 +97.86964 +97.87114 +97.87147 +97.8708 +97.8698 +97.8678 +97.8663 +97.8658 +97.86363 +97.86297 +97.86113 +97.8588 +97.85547 +97.85447 +97.85597 +97.85914 +97.86197 +97.86263 +97.86097 +97.85797 +97.85614 +97.85564 +97.8568 +97.85947 +97.86313 +97.8673 +97.87147 +97.87264 +97.87197 +97.8698 +97.86764 +97.86663 +97.8693 +97.8723 +97.8758 +97.87747 +97.8778 +97.87797 +97.87763 +97.87713 +97.87563 +97.87563 +97.8768 +97.8773 +97.87863 +97.8788 +97.87963 +97.87946 +97.87913 +97.87863 +97.87913 +97.8803 +97.88214 +97.88414 +97.88697 +97.89047 +97.89547 +97.89847 +97.9008 +97.90313 +97.9053 +97.90846 +97.91113 +97.91263 +97.91197 +97.91113 +97.90963 +97.90696 +97.90363 +97.90097 +97.90047 +97.89963 +97.89963 +97.90063 +97.90346 +97.90763 +97.91047 +97.91263 +97.91413 +97.9148 +97.91746 +97.91863 +97.9178 +97.91596 +97.91513 +97.91347 +97.91363 +97.91663 +97.92046 +97.9193 +97.9193 +97.92046 +97.91896 +97.9198 +97.92096 +97.92129 +97.92129 +97.9223 +97.92313 +97.92229 +97.92113 +97.9188 +97.91796 +97.9203 +97.92129 +97.92313 +97.9248 +97.9273 +97.93046 +97.93379 +97.93796 +97.9403 +97.94263 +97.9458 +97.94796 +97.95013 +97.95247 +97.95363 +97.95363 +97.95546 +97.9578 +97.95929 +97.96146 +97.96329 +97.96596 +97.96646 +97.96796 +97.96996 +97.97263 +97.97429 +97.97579 +97.97662 +97.97596 +97.97413 +97.97296 +97.97163 +97.97179 +97.97296 +97.97479 +97.97746 +97.97946 +97.98113 +97.98246 +97.98346 +97.98346 +97.98113 +97.9793 +97.97963 +97.9828 +97.98629 +97.98846 +97.98879 +97.98979 +97.99163 +97.99446 +97.99663 +97.99829 +98.00096 +98.00329 +98.00529 +98.0073 +98.00713 +98.00663 +98.00663 +98.00663 +98.0068 +98.00646 +98.00496 +98.00412 +98.00362 +98.00279 +98.00129 +97.99963 +97.99913 +98.00113 +98.00312 +98.00379 +98.00462 +98.00396 +98.00379 +98.00312 +98.00329 +98.00362 +98.00522 +98.00746 +98.00946 +98.01146 +98.01263 +98.01363 +98.01363 +98.01379 +98.01412 +98.01529 +98.01629 +98.01712 +98.01829 +98.02046 +98.02296 +98.02479 +98.02563 +98.02479 +98.02529 +98.02563 +98.02596 +98.02579 +98.02463 +98.02563 +98.02812 +98.03212 +98.03479 +98.03646 +98.03912 +98.04079 +98.04112 +98.04196 +98.04196 +98.04179 +98.04112 +98.03863 
+98.03663 +98.03412 +98.03229 +98.03229 +98.03429 +98.03729 +98.03979 +98.04096 +98.04179 +98.04212 +98.04379 +98.04529 +98.04562 +98.04445 +98.04312 +98.04246 +98.04212 +98.04329 +98.04412 +98.04396 +98.04279 +98.04162 +98.04196 +98.04412 +98.04712 +98.04946 +98.04913 +98.04712 +98.04612 +98.04562 +98.04479 +98.04379 +98.04429 +98.04595 +98.04796 +98.04846 +98.04762 +98.04745 +98.04829 +98.04979 +98.05146 +98.05212 +98.05179 +98.05162 +98.05079 +98.05063 +98.05129 +98.05262 +98.05196 +98.04946 +98.04779 +98.04796 +98.04896 +98.04996 +98.05129 +98.05162 +98.05063 +98.04896 +98.04796 +98.04662 +98.04512 +98.04362 +98.04329 +98.04479 +98.04612 +98.04562 +98.04462 +98.04545 +98.04779 +98.05079 +98.05279 +98.05296 +98.05196 +98.05096 +98.05063 +98.05013 +98.04929 +98.04829 +98.04779 +98.04745 +98.04662 +98.04662 +98.04762 +98.04913 +98.05162 +98.05462 +98.05712 +98.05929 +98.06129 +98.06313 +98.06412 +98.06446 +98.06429 +98.06546 +98.06829 +98.07062 +98.07312 +98.07512 +98.07696 +98.07829 +98.07896 +98.07996 +98.08062 +98.08212 +98.08379 +98.08529 +98.08712 +98.08912 +98.09146 +98.09412 +98.09695 +98.09862 +98.09879 +98.09879 +98.09845 +98.09695 +98.09429 +98.09179 +98.09062 +98.09096 +98.09212 +98.09379 +98.09529 +98.09645 +98.09562 +98.09312 +98.09196 +98.09229 +98.09412 +98.09579 +98.09579 +98.09495 +98.09362 +98.09229 +98.09162 +98.09096 +98.09046 +98.08962 +98.08946 +98.08879 +98.08745 +98.08679 +98.08645 +98.08562 +98.08562 +98.08479 +98.08295 +98.08079 +98.07962 +98.07979 +98.08195 +98.08545 +98.08879 +98.09079 +98.09246 +98.09429 +98.09462 +98.09346 +98.09029 +98.08745 +98.08512 +98.08279 +98.08129 +98.08179 +98.08362 +98.08562 +98.08728 +98.08762 +98.08712 +98.08629 +98.08479 +98.08362 +98.08329 +98.08395 +98.08595 +98.08845 +98.09079 +98.09296 +98.09346 +98.09229 +98.09079 +98.09012 +98.08996 +98.08979 +98.09062 +98.09146 +98.09279 +98.09479 +98.09762 +98.10078 +98.10412 +98.10712 +98.10829 +98.10662 +98.10329 +98.10095 +98.10012 +98.10195 +98.10479 +98.10729 +98.10929 +98.11062 +98.11179 +98.11262 +98.11395 +98.11528 +98.11645 +98.11729 +98.11762 +98.11762 +98.11729 +98.11562 +98.11378 +98.11212 +98.11029 +98.10912 +98.10879 +98.11012 +98.11145 +98.10995 +98.10795 +98.10629 +98.10562 +98.10562 +98.10579 +98.10695 +98.10879 +98.11095 +98.11262 +98.11212 +98.11129 +98.11179 +98.11328 +98.11345 +98.11129 +98.10812 +98.10379 +98.10012 +98.09862 +98.09895 +98.10145 +98.10529 +98.10895 +98.11095 +98.11095 +98.10862 +98.10679 +98.10579 +98.10562 +98.10546 +98.10712 +98.10929 +98.11129 +98.11162 +98.11112 +98.11112 +98.11195 +98.11345 +98.11512 +98.11662 +98.11829 +98.12095 +98.12495 +98.12978 +98.13379 +98.13529 +98.13462 +98.13345 +98.13095 +98.12962 +98.13028 +98.13162 +98.13412 +98.13629 +98.13778 +98.13928 +98.14062 +98.14278 +98.14478 +98.14629 +98.14562 +98.14378 +98.14262 +98.14428 +98.14779 +98.15162 +98.15362 +98.15295 +98.15145 +98.14945 +98.14779 +98.14762 +98.14712 +98.14629 +98.14529 +98.14428 +98.14361 +98.14328 +98.14262 +98.14262 +98.14212 +98.14262 +98.14295 +98.14311 +98.14262 +98.14212 +98.14128 +98.14062 +98.14062 +98.14062 +98.14178 +98.14295 +98.14428 +98.14545 +98.14729 +98.14995 +98.15212 +98.15445 +98.15695 +98.15945 +98.16079 +98.16112 +98.16162 +98.16162 +98.16245 +98.16228 +98.16112 +98.15979 +98.15861 +98.15861 +98.15861 +98.15861 +98.15861 +98.15895 +98.15995 +98.16129 +98.16212 +98.16312 +98.16378 +98.16445 +98.16395 +98.16262 +98.16129 +98.15979 +98.15962 +98.16079 +98.16145 +98.16295 +98.16412 +98.16428 +98.16345 +98.16228 +98.16079 +98.15912 +98.15861 
+98.15912 +98.16012 +98.16062 +98.16062 +98.16012 +98.15928 +98.15878 +98.15861 +98.15861 +98.15711 +98.15545 +98.15295 +98.15062 +98.14895 +98.14695 +98.14629 +98.14662 +98.14829 +98.15012 +98.15195 +98.15328 +98.15445 +98.15428 +98.15212 +98.14978 +98.14762 +98.14612 +98.14512 +98.14461 +98.14495 +98.14562 +98.14662 +98.14812 +98.14862 +98.14812 +98.14762 +98.14762 +98.14795 +98.14862 +98.14912 +98.14962 +98.14929 +98.14862 +98.14962 +98.14962 +98.15045 +98.14962 +98.14962 +98.14962 +98.14895 +98.14862 +98.15012 +98.15295 +98.15595 +98.15861 +98.15962 +98.15962 +98.15861 +98.15745 +98.15628 +98.15561 +98.15645 +98.15761 +98.15761 +98.15645 +98.15462 +98.15345 +98.15262 +98.15328 +98.15395 +98.15495 +98.15561 +98.15678 +98.15778 +98.15861 +98.15928 +98.15962 +98.15962 +98.15895 +98.15778 +98.15578 +98.15462 +98.15462 +98.15545 +98.15661 +98.15795 +98.15828 +98.15778 +98.15761 +98.15761 +98.15761 +98.15761 +98.15645 +98.15578 +98.15545 +98.15478 +98.15495 +98.15478 +98.15462 +98.15395 +98.15178 +98.14978 +98.14862 +98.14862 +98.14879 +98.14962 +98.14978 +98.14962 +98.14895 +98.14795 +98.14679 +98.14662 +98.14645 +98.14729 +98.14829 +98.14862 +98.14745 +98.14612 +98.14478 +98.14461 +98.14361 +98.14295 +98.14328 +98.14495 +98.14762 +98.15012 +98.15162 +98.15212 +98.15145 +98.15012 +98.14812 +98.14762 +98.14845 +98.15012 +98.15128 +98.15162 +98.15112 +98.15078 +98.15028 +98.14945 +98.14745 +98.14562 +98.14411 +98.14262 +98.14328 +98.14395 +98.14512 +98.14662 +98.14712 +98.14812 +98.14829 +98.14745 +98.14679 +98.14662 +98.14729 +98.14879 +98.15128 +98.15462 +98.15761 +98.16012 +98.16079 +98.16045 +98.15979 +98.16045 +98.16179 +98.16395 +98.16662 +98.16895 +98.17095 +98.17261 +98.17362 +98.17462 +98.17462 +98.17462 +98.17495 +98.17612 +98.17862 +98.18145 +98.18361 +98.18561 +98.18862 +98.19145 +98.19345 +98.19445 +98.19495 +98.19561 +98.19661 +98.19678 +98.19761 +98.19828 +98.19995 +98.20112 +98.20078 +98.20062 +98.20178 +98.20245 +98.20145 +98.20045 +98.20011 +98.20062 +98.20045 +98.19928 +98.19711 +98.19461 +98.19212 +98.19195 +98.19445 +98.19711 +98.19911 +98.19995 +98.20095 +98.20128 +98.20062 +98.20062 +98.20145 +98.20162 +98.20162 +98.20178 +98.20162 +98.20162 +98.20228 +98.20212 +98.20262 +98.20162 +98.20112 +98.20128 +98.20212 +98.20428 +98.20645 +98.20861 +98.21011 +98.21178 +98.21294 +98.21411 +98.21512 +98.21545 +98.21562 +98.21562 +98.21578 +98.21728 +98.21895 +98.22095 +98.22311 +98.22478 +98.22611 +98.22661 +98.22594 +98.22411 +98.22211 +98.22078 +98.21911 +98.21678 +98.21378 +98.20928 +98.20445 +98.20145 +98.19995 +98.19844 +98.19778 +98.19795 +98.20095 +98.20378 +98.20395 +98.20162 +98.19961 +98.19861 +98.19894 +98.20062 +98.20278 +98.20495 +98.20645 +98.20761 +98.20728 +98.20595 +98.20462 +98.20462 +98.20462 +98.20462 +98.20412 +98.20278 +98.20095 +98.19995 +98.19928 +98.19861 +98.19761 +98.19811 +98.19894 +98.19961 +98.20011 +98.19894 +98.19778 +98.19661 +98.19578 +98.19528 +98.19561 +98.19695 +98.19761 +98.19778 +98.19795 +98.19795 +98.19861 +98.19828 +98.19761 +98.19645 +98.19561 +98.19561 +98.19561 +98.19561 +98.19578 +98.19561 +98.19561 +98.19561 +98.19511 +98.19428 +98.19378 +98.19361 +98.19345 +98.19345 +98.19261 +98.19261 +98.19261 +98.19261 +98.19261 +98.19378 +98.19495 +98.19628 +98.19728 +98.19761 +98.19678 +98.19611 +98.19511 +98.19361 +98.19228 +98.18995 +98.18795 +98.18795 +98.19095 +98.19561 +98.19995 +98.20245 +98.20212 +98.19878 +98.19445 +98.18962 +98.18595 +98.18378 +98.18261 +98.18195 +98.18228 +98.18328 +98.18411 +98.18511 +98.18628 +98.18711 +98.18778 
+98.18812 +98.18762 +98.18711 +98.18695 +98.18762 +98.18795 +98.18862 +98.18862 +98.18945 +98.19062 +98.19128 +98.19212 +98.19162 +98.19045 +98.18878 +98.18912 +98.19095 +98.19395 +98.19728 +98.20012 +98.20162 +98.20212 +98.20162 +98.20212 +98.20295 +98.20478 +98.20761 +98.20995 +98.21178 +98.21328 +98.21361 +98.21361 +98.21361 +98.21445 +98.21662 +98.21961 +98.22361 +98.22711 +98.22912 +98.23028 +98.23128 +98.23178 +98.23195 +98.23178 +98.23195 +98.23245 +98.23361 +98.23511 +98.23811 +98.24111 +98.24411 +98.24561 +98.24661 +98.24828 +98.25061 +98.25278 +98.25361 +98.25344 +98.25278 +98.25261 +98.25261 +98.25311 +98.25377 +98.25511 +98.25661 +98.25761 +98.25861 +98.25861 diff --git a/data/samples/neon_bio_temp_c.csv b/data/samples/neon_bio_temp_c.csv new file mode 100644 index 0000000..8bee614 --- /dev/null +++ b/data/samples/neon_bio_temp_c.csv @@ -0,0 +1,1024 @@ +12.47 +12.49 +12.72 +12.81 +12.58 +12.29 +12.18 +11.77 +11.42 +11.25 +11.0 +10.84 +10.75 +10.7 +10.75 +10.91 +10.94 +10.85 +10.61 +10.44 +10.49 +10.5 +10.7 +11.09 +11.26 +11.3 +11.4 +11.25 +11.32 +11.26 +11.39 +11.54 +11.62 +11.53 +11.33 +10.98 +10.71 +10.46 +10.4 +10.47 +10.71 +10.88 +11.0 +11.05 +10.99 +10.82 +10.84 +10.88 +11.11 +11.33 +11.49 +11.62 +11.62 +11.59 +11.63 +11.71 +11.87 +12.01 +11.99 +11.91 +11.63 +11.86 +12.09 +11.89 +11.62 +11.0 +10.99 +11.07 +11.25 +11.37 +11.56 +11.59 +11.24 +11.31 +11.47 +10.58 +10.17 +10.03 +9.92 +9.76 +9.65 +9.61 +9.61 +9.62 +9.6 +9.56 +9.47 +9.43 +9.51 +9.48 +9.41 +9.37 +9.39 +9.37 +9.3 +9.27 +9.25 +9.21 +9.22 +9.16 +9.12 +9.07 +9.04 +8.99 +8.91 +8.91 +8.88 +8.84 +8.8 +8.73 +8.67 +8.57 +8.58 +8.61 +8.59 +8.54 +8.56 +8.53 +8.48 +8.52 +8.53 +8.46 +8.43 +8.43 +8.41 +8.39 +8.38 +8.35 +8.34 +8.31 +8.29 +8.27 +8.27 +8.24 +8.26 +8.26 +8.27 +8.23 +8.18 +8.14 +8.16 +8.14 +8.12 +8.09 +8.06 +8.06 +8.06 +8.03 +8.05 +8.01 +7.98 +8.0 +7.97 +7.99 +7.98 +7.96 +7.99 +8.13 +8.19 +8.23 +8.23 +8.19 +8.18 +8.21 +8.22 +8.24 +8.29 +8.35 +8.36 +8.42 +8.45 +8.47 +8.48 +8.5 +8.48 +8.45 +8.3 +8.23 +8.22 +8.21 +8.15 +8.14 +8.14 +8.16 +8.21 +8.19 +8.17 +8.15 +8.06 +8.09 +8.11 +8.09 +8.04 +8.08 +8.07 +8.08 +8.08 +8.1 +8.08 +8.02 +8.0 +7.93 +7.86 +7.89 +7.93 +7.93 +7.84 +7.77 +7.67 +7.55 +7.38 +7.37 +7.32 +7.25 +7.18 +7.1 +7.11 +7.15 +7.15 +7.13 +7.15 +7.19 +7.2 +7.16 +7.18 +7.17 +7.21 +7.15 +7.1 +7.06 +6.97 +7.07 +7.07 +7.06 +6.98 +6.94 +6.93 +6.85 +6.85 +6.82 +6.87 +6.89 +6.84 +6.83 +6.86 +6.84 +6.84 +6.84 +6.85 +6.9 +6.89 +6.89 +6.92 +6.9 +6.95 +6.97 +6.96 +6.99 +7.05 +7.13 +7.13 +7.17 +7.19 +7.22 +7.23 +7.21 +7.19 +7.23 +7.18 +7.14 +7.17 +7.2 +7.23 +7.27 +7.22 +7.16 +7.08 +7.16 +7.11 +7.08 +7.13 +7.16 +7.14 +7.05 +7.07 +7.12 +7.17 +7.2 +7.21 +7.14 +7.16 +7.13 +7.12 +7.09 +7.13 +7.19 +7.22 +7.24 +7.26 +7.23 +7.22 +7.21 +7.24 +7.24 +7.22 +7.19 +7.17 +7.06 +7.01 +7.01 +6.94 +6.97 +6.98 +6.97 +6.94 +6.89 +6.88 +6.89 +6.9 +6.86 +6.83 +6.78 +6.76 +6.78 +6.74 +6.71 +6.63 +6.57 +6.46 +6.44 +6.45 +6.44 +6.45 +6.44 +6.41 +6.43 +6.45 +6.46 +6.52 +6.52 +6.55 +6.57 +6.49 +6.44 +6.41 +6.44 +6.38 +6.41 +6.43 +6.44 +6.43 +6.45 +6.46 +6.47 +6.46 +6.5 +6.41 +6.4 +6.44 +6.46 +6.49 +6.43 +6.44 +6.38 +6.3 +6.27 +6.23 +6.13 +6.13 +6.09 +6.01 +5.94 +5.92 +5.86 +5.79 +5.75 +5.72 +5.68 +5.63 +5.61 +5.56 +5.55 +5.52 +5.58 +5.5 +5.51 +5.45 +5.39 +5.4 +5.35 +5.34 +5.39 +5.39 +5.42 +5.36 +5.26 +5.29 +5.24 +5.29 +5.3 +5.35 +5.26 +5.26 +5.19 +5.15 +5.15 +5.14 +5.09 +5.04 +5.03 +4.97 +4.98 +5.02 +5.02 +4.99 +4.98 +4.95 +4.95 +4.95 +4.9 +4.88 +4.84 +4.79 +4.78 +4.8 +4.85 +4.86 +4.82 +4.76 +4.72 +4.7 +4.78 +4.81 +4.72 +4.69 +4.61 +4.67 +4.64 +4.68 
+4.6 +4.56 +4.61 +4.58 +4.61 +4.65 +4.71 +4.74 +4.68 +4.6 +4.63 +4.75 +4.67 +4.67 +4.73 +4.69 +4.7 +4.73 +4.81 +4.85 +4.86 +4.88 +4.86 +4.81 +4.79 +4.87 +4.9 +4.93 +4.96 +4.93 +5.01 +5.01 +5.08 +5.05 +4.93 +5.01 +5.03 +5.04 +5.1 +5.04 +4.96 +5.0 +5.06 +5.1 +5.08 +5.06 +5.03 +5.02 +4.98 +4.99 +4.99 +4.92 +4.92 +4.87 +4.85 +4.75 +4.8 +4.84 +4.75 +4.7 +4.62 +4.7 +4.77 +4.68 +4.67 +4.67 +4.63 +4.63 +4.71 +4.74 +4.71 +4.73 +4.76 +4.84 +4.84 +4.77 +4.85 +4.88 +4.87 +4.9 +4.91 +4.88 +4.87 +4.91 +4.82 +4.79 +4.83 +4.83 +4.85 +4.87 +4.83 +4.83 +4.87 +4.94 +5.03 +5.02 +4.95 +4.98 +4.98 +4.88 +4.91 +4.96 +4.99 +5.01 +5.01 +5.04 +5.02 +5.09 +5.11 +5.09 +5.03 +5.05 +5.17 +5.23 +5.24 +5.14 +5.18 +5.23 +5.21 +5.18 +5.19 +5.27 +5.32 +5.31 +5.32 +5.26 +5.3 +5.34 +5.35 +5.4 +5.41 +5.42 +5.4 +5.47 +5.53 +5.56 +5.49 +5.44 +5.51 +5.54 +5.58 +5.57 +5.55 +5.53 +5.54 +5.55 +5.55 +5.56 +5.57 +5.56 +5.52 +5.43 +5.38 +5.34 +5.38 +5.4 +5.39 +5.38 +5.39 +5.4 +5.45 +5.51 +5.51 +5.51 +5.51 +5.48 +5.51 +5.56 +5.6 +5.65 +5.64 +5.57 +5.63 +5.67 +5.65 +5.65 +5.65 +5.64 +5.62 +5.59 +5.63 +5.61 +5.55 +5.6 +5.6 +5.61 +5.61 +5.62 +5.63 +5.66 +5.68 +5.72 +5.79 +5.78 +5.75 +5.76 +5.73 +5.73 +5.71 +5.76 +5.77 +5.79 +5.74 +5.71 +5.66 +5.71 +5.75 +5.78 +5.76 +5.79 +5.83 +5.83 +5.79 +5.76 +5.86 +5.92 +5.89 +5.88 +5.85 +5.82 +5.81 +5.85 +5.83 +5.79 +5.74 +5.72 +5.76 +5.75 +5.76 +5.78 +5.82 +5.76 +5.66 +5.77 +5.83 +5.82 +5.79 +5.79 +5.8 +5.8 +5.72 +5.73 +5.73 +5.78 +5.72 +5.74 +5.76 +5.75 +5.69 +5.64 +5.72 +5.77 +5.79 +5.82 +5.79 +5.78 +5.85 +5.86 +5.84 +5.84 +5.85 +5.87 +5.83 +5.87 +5.91 +5.94 +5.95 +5.97 +5.94 +5.93 +5.91 +5.94 +5.96 +5.99 +5.97 +5.94 +6.0 +6.05 +6.06 +6.0 +5.97 +5.92 +5.92 +5.89 +5.85 +5.75 +5.76 +5.68 +5.74 +5.79 +5.74 +5.69 +5.73 +5.82 +5.83 +5.82 +5.81 +5.78 +5.8 +5.75 +5.73 +5.74 +5.75 +5.78 +5.79 +5.8 +5.79 +5.77 +5.85 +5.83 +5.86 +5.86 +5.94 +5.87 +5.9 +5.86 +5.76 +5.77 +5.8 +5.79 +5.82 +5.83 +5.88 +5.86 +5.88 +5.91 +5.9 +5.89 +5.9 +5.93 +5.93 +5.94 +5.94 +5.96 +6.01 +5.96 +5.97 +5.97 +5.99 +6.0 +5.98 +6.02 +6.01 +6.0 +6.01 +6.02 +6.05 +6.04 +6.05 +6.02 +6.05 +5.96 +5.99 +5.97 +5.97 +5.98 +5.99 +6.02 +6.03 +6.0 +5.95 +5.93 +5.88 +5.89 +5.89 +5.89 +5.79 +5.77 +5.77 +5.8 +5.84 +5.89 +5.89 +5.86 +5.82 +5.81 +5.85 +5.82 +5.77 +5.78 +5.76 +5.77 +5.76 +5.78 +5.76 +5.72 +5.73 +5.75 +5.78 +5.84 +5.87 +5.83 +5.84 +5.81 +5.82 +5.79 +5.75 +5.77 +5.77 +5.86 +5.89 +5.89 +5.96 +5.96 +5.92 +5.92 +5.94 +5.93 +5.97 +5.97 +5.94 +5.92 +5.96 +5.9 +5.93 +5.96 +5.99 +5.98 +6.0 +5.91 +5.86 +5.82 +5.84 +5.83 +5.84 +5.84 +5.89 +5.87 +5.82 +5.78 +5.79 +5.78 +5.81 +5.79 +5.78 +5.75 +5.73 +5.74 +5.77 +5.8 +5.81 +5.8 +5.75 +5.76 +5.73 +5.74 +5.7 +5.73 +5.76 +5.79 +5.85 +5.84 +5.81 +5.88 +5.88 +5.84 +5.83 +5.8 +5.81 +5.81 +5.79 +5.75 +5.78 +5.81 +5.77 +5.76 +5.75 +5.82 +5.85 +5.87 +5.81 +5.8 +5.83 +5.8 +5.79 +5.87 +5.89 +5.89 +5.88 +5.91 +5.94 +5.92 +5.94 +5.98 +5.97 +5.99 +5.99 +6.0 +5.94 +5.9 +5.89 +5.9 +5.88 +5.89 +5.87 +5.83 +5.81 +5.87 +5.92 +5.92 +5.92 +5.8 +5.81 +5.82 +5.84 +5.85 +5.89 +5.87 +5.87 +5.88 +5.84 +5.81 +5.85 +5.85 +5.86 +5.89 +5.87 +5.91 +5.85 +5.84 +5.88 +5.89 +5.87 +5.84 +5.83 +5.88 +5.9 +5.88 +5.86 +5.91 +5.92 +5.91 +5.88 +5.88 +5.85 +5.89 +5.95 +5.94 +5.98 +5.98 +6.01 +6.02 +6.03 +6.09 +6.15 +6.17 +6.15 +6.15 +6.18 +6.2 +6.26 +6.25 +6.27 +6.29 +6.29 +6.32 +6.34 +6.32 +6.31 +6.32 +6.3 +6.31 +6.22 +6.24 +6.25 +6.24 +6.09 +6.13 +6.19 +6.24 +6.23 +6.22 +6.27 +6.27 +6.3 +6.3 +6.37 +6.44 +6.44 +6.4 +6.38 +6.37 +6.37 +6.35 +6.35 diff --git a/data/samples/neon_dew_point_temp.csv b/data/samples/neon_dew_point_temp.csv new 
file mode 100644 index 0000000..fca3633 --- /dev/null +++ b/data/samples/neon_dew_point_temp.csv @@ -0,0 +1,1024 @@ +22.441 +22.434 +22.422 +22.398 +22.371 +22.375 +22.376 +22.409 +22.375 +22.322 +22.302 +22.29 +22.268 +22.269 +22.283 +22.292 +22.261 +22.265 +22.244 +22.272 +22.3 +22.307 +22.322 +22.29 +22.245 +22.277 +22.31 +22.312 +22.333 +22.307 +22.291 +22.272 +22.259 +22.284 +22.303 +22.303 +22.277 +22.254 +22.253 +22.312 +22.273 +22.227 +22.176 +22.181 +22.202 +22.189 +22.199 +22.164 +22.192 +22.257 +22.217 +22.209 +22.24 +22.179 +22.188 +22.227 +22.155 +22.161 +22.185 +22.15 +22.169 +22.199 +22.157 +22.108 +22.078 +22.05 +22.089 +22.08 +22.064 +22.097 +22.041 +22.004 +22.053 +22.051 +22.021 +22.027 +22.054 +22.036 +22.073 +22.11 +22.093 +22.093 +22.064 +22.056 +22.05 +22.034 +22.045 +22.083 +22.078 +22.086 +22.106 +22.085 +22.087 +22.089 +22.136 +22.141 +22.181 +22.221 +22.218 +22.229 +22.245 +22.172 +22.164 +22.154 +22.132 +22.167 +22.184 +22.227 +22.208 +22.198 +22.164 +22.212 +22.211 +22.167 +22.146 +22.11 +22.159 +22.121 +22.058 +22.096 +22.119 +22.077 +22.068 +22.096 +22.096 +22.115 +22.118 +22.122 +22.128 +22.125 +22.082 +22.036 +22.038 +22.021 +22.022 +22.006 +22.017 +22.013 +21.987 +21.977 +21.97 +21.933 +21.916 +21.949 +21.951 +22.001 +21.931 +22.021 +21.934 +21.904 +21.929 +21.905 +21.887 +21.858 +21.849 +21.857 +21.854 +21.867 +21.853 +21.798 +21.86 +21.906 +21.875 +21.866 +21.823 +21.781 +21.801 +21.843 +21.81 +21.843 +21.825 +21.794 +21.784 +21.772 +21.74 +21.728 +21.749 +21.791 +21.741 +21.729 +21.705 +21.677 +21.632 +21.634 +21.67 +21.623 +21.614 +21.657 +21.68 +21.634 +21.668 +21.675 +21.644 +21.633 +21.645 +21.642 +21.692 +21.692 +21.689 +21.738 +21.704 +21.653 +21.656 +21.637 +21.61 +21.653 +21.67 +21.642 +21.652 +21.672 +21.703 +21.683 +21.662 +21.613 +21.682 +21.71 +21.689 +21.676 +21.691 +21.685 +21.687 +21.678 +21.665 +21.664 +21.651 +21.616 +21.582 +21.578 +21.606 +21.636 +21.614 +21.577 +21.559 +21.601 +21.627 +21.594 +21.535 +21.541 +21.537 +21.554 +21.564 +21.582 +21.622 +21.587 +21.554 +21.553 +21.557 +21.554 +21.531 +21.508 +21.506 +21.527 +21.573 +21.536 +21.567 +21.548 +21.496 +21.472 +21.449 +21.449 +21.466 +21.582 +21.586 +21.595 +21.544 +21.551 +21.597 +21.625 +21.562 +21.508 +21.508 +21.515 +21.494 +21.489 +21.517 +21.535 +21.581 +21.548 +21.544 +21.565 +21.612 +21.658 +21.675 +21.593 +21.539 +21.495 +21.494 +21.505 +21.485 +21.477 +21.515 +21.491 +21.481 +21.474 +21.473 +21.45 +21.465 +21.471 +21.513 +21.552 +21.584 +21.519 +21.446 +21.409 +21.449 +21.523 +21.51 +21.457 +21.439 +21.444 +21.385 +21.384 +21.334 +21.313 +21.365 +21.37 +21.394 +21.443 +21.383 +21.325 +21.331 +21.308 +21.37 +21.331 +21.33 +21.306 +21.279 +21.342 +21.348 +21.334 +21.401 +21.442 +21.401 +21.41 +21.349 +21.357 +21.435 +21.402 +21.373 +21.378 +21.383 +21.342 +21.286 +21.277 +21.299 +21.295 +21.253 +21.276 +21.319 +21.347 +21.353 +21.307 +21.279 +21.251 +21.269 +21.275 +21.296 +21.304 +21.267 +21.333 +21.298 +21.261 +21.259 +21.307 +21.37 +21.351 +21.336 +21.332 +21.356 +21.438 +21.417 +21.413 +21.389 +21.465 +21.368 +21.35 +21.323 +21.362 +21.377 +21.38 +21.365 +21.322 +21.288 +21.321 +21.421 +21.384 +21.383 +21.38 +21.396 +21.385 +21.547 +21.493 +21.43 +21.407 +21.414 +21.405 +21.414 +21.428 +21.467 +21.535 +21.603 +21.547 +21.523 +21.505 +21.523 +21.551 +21.571 +21.548 +21.551 +21.492 +21.47 +21.491 +21.588 +21.57 +21.531 +21.528 +21.516 +21.546 +21.613 +21.613 +21.703 +21.685 +21.669 +21.65 +21.659 +21.597 +21.613 +21.632 +21.653 +21.623 +21.649 +21.624 +21.646 +21.626 
+21.565 +21.588 +21.567 +21.553 +21.539 +21.541 +21.618 +21.638 +21.598 +21.614 +21.644 +21.654 +21.622 +21.689 +21.586 +21.605 +21.598 +21.596 +21.63 +21.663 +21.649 +21.603 +21.603 +21.667 +21.702 +21.71 +21.699 +21.671 +21.592 +21.609 +21.644 +21.616 +21.606 +21.607 +21.575 +21.594 +21.611 +21.58 +21.571 +21.591 +21.607 +21.604 +21.598 +21.641 +21.604 +21.534 +21.548 +21.62 +21.706 +21.684 +21.651 +21.594 +21.563 +21.628 +21.662 +21.658 +21.674 +21.639 +21.657 +21.687 +21.636 +21.6 +21.6 +21.573 +21.632 +21.669 +21.672 +21.654 +21.652 +21.716 +21.812 +21.794 +21.776 +21.812 +21.802 +21.755 +21.73 +21.724 +21.82 +21.875 +21.804 +21.816 +21.883 +21.846 +21.801 +21.86 +21.851 +21.85 +21.809 +21.794 +21.771 +21.77 +21.778 +21.793 +21.792 +21.795 +21.841 +21.84 +21.846 +21.821 +21.854 +21.816 +21.829 +21.812 +21.792 +21.838 +21.866 +21.879 +21.927 +21.876 +21.882 +21.841 +21.848 +21.859 +21.856 +21.844 +21.834 +21.797 +21.819 +21.786 +21.773 +21.763 +21.758 +21.756 +21.784 +21.792 +21.811 +21.768 +21.739 +21.729 +21.747 +21.757 +21.818 +21.806 +21.775 +21.766 +21.727 +21.697 +21.715 +21.746 +21.743 +21.766 +21.765 +21.75 +21.771 +21.779 +21.855 +21.788 +21.77 +21.749 +21.741 +21.797 +21.785 +21.833 +21.828 +21.808 +21.815 +21.823 +21.799 +21.85 +21.813 +21.817 +21.817 +21.834 +21.842 +21.84 +21.84 +21.841 +21.859 +21.877 +21.964 +21.944 +21.939 +21.923 +21.951 +21.983 +21.934 +21.923 +21.927 +21.907 +21.876 +21.946 +21.991 +21.978 +22.03 +22.047 +21.977 +21.947 +21.901 +21.907 +21.945 +21.903 +21.921 +21.91 +21.877 +21.893 +21.91 +21.947 +22.0 +21.951 +21.929 +21.94 +21.919 +21.959 +21.981 +21.972 +21.978 +21.961 +21.927 +21.907 +21.919 +21.944 +22.02 +22.009 +22.001 +21.98 +21.97 +21.976 +21.954 +21.916 +21.916 +21.964 +21.993 +21.944 +21.935 +21.946 +21.955 +21.954 +21.958 +21.955 +21.985 +22.01 +22.007 +21.968 +21.978 +22.0 +22.032 +21.966 +21.918 +21.937 +21.973 +21.994 +21.963 +21.974 +21.982 +22.019 +21.974 +21.955 +21.957 +21.983 +21.983 +22.004 +22.002 +21.944 +21.927 +21.928 +21.899 +21.966 +21.934 +21.934 +21.961 +21.91 +21.896 +21.868 +21.874 +21.88 +21.897 +21.94 +21.932 +21.918 +21.916 +21.914 +21.948 +21.914 +21.943 +21.963 +21.964 +21.971 +21.964 +21.944 +21.993 +22.043 +22.024 +22.015 +22.052 +22.114 +22.11 +22.143 +22.148 +22.159 +22.2 +22.208 +22.207 +22.213 +22.284 +22.31 +22.286 +22.248 +22.215 +22.244 +22.323 +22.347 +22.363 +22.363 +22.398 +22.443 +22.493 +22.547 +22.627 +22.671 +22.659 +22.655 +22.679 +22.703 +22.749 +22.754 +22.685 +22.656 +22.641 +22.722 +22.842 +22.895 +22.853 +22.843 +22.839 +22.907 +22.961 +22.965 +22.976 +22.983 +22.987 +23.04 +23.026 +22.981 +22.962 +22.972 +22.955 +22.899 +22.874 +22.878 +22.892 +22.909 +22.971 +23.009 +23.084 +23.148 +23.21 +23.266 +23.248 +23.261 +23.316 +23.343 +23.29 +23.266 +23.321 +23.318 +23.32 +23.311 +23.333 +23.344 +23.339 +23.377 +23.459 +23.535 +23.597 +23.655 +23.809 +23.901 +23.896 +23.868 +23.803 +23.732 +23.685 +23.678 +23.676 +23.664 +23.703 +23.795 +23.896 +23.91 +23.944 +23.799 +23.739 +23.748 +23.759 +23.758 +23.726 +23.712 +23.748 +23.81 +23.801 +23.8 +23.745 +23.775 +23.818 +23.907 +23.981 +24.024 +24.127 +24.286 +24.343 +24.508 +24.622 +24.612 +24.532 +24.487 +24.384 +24.291 +24.167 +24.14 +24.116 +24.139 +24.203 +24.273 +24.253 +24.212 +24.162 +24.172 +24.203 +24.214 +24.232 +24.266 +24.256 +24.245 +24.244 +24.244 +24.226 +24.226 +24.263 +24.3 +24.323 +24.355 +24.436 +24.489 +24.568 +24.602 +24.543 +24.534 +24.568 +24.643 +24.673 +24.638 +24.663 +24.672 +24.655 +24.667 +24.7 +24.763 +24.748 +24.775 
+24.852 +24.934 +24.962 +24.968 +24.995 +24.981 +25.059 +25.181 +25.262 +25.282 +25.279 +25.218 +25.162 +25.106 +25.075 +25.021 +24.981 +24.948 +24.958 +25.01 +25.109 +25.257 +25.355 +25.402 +25.481 +25.389 +25.285 +25.261 +25.226 +25.188 +25.194 +25.202 +25.231 +25.224 +25.164 +25.164 +25.152 +25.149 +25.125 +25.096 +25.086 +25.089 +25.064 +25.01 +24.977 +24.951 +24.933 +24.939 +24.931 +24.955 +24.959 +24.946 +24.94 +24.905 +24.892 +24.904 +24.944 +24.942 +24.947 +24.992 +25.036 +25.068 +25.097 +25.141 +25.171 +25.222 +25.234 +25.244 +25.269 +25.322 +25.389 +25.388 +25.42 +25.473 +25.568 +25.697 +25.798 +25.84 +25.849 +25.81 +25.836 +25.845 +25.845 +25.857 +25.925 +25.997 +25.977 +25.97 +25.967 +26.04 +26.119 +26.144 +26.043 +25.943 +25.888 +25.928 +25.985 +25.984 +25.958 +25.932 +25.918 +25.928 +25.95 +25.928 +25.885 +25.895 +25.915 +25.963 +25.991 +25.938 +25.94 +25.956 +25.997 +26.038 +26.054 +26.073 +26.059 +26.06 +26.074 +26.084 +26.063 +26.064 +26.067 +26.065 +26.071 +26.06 +26.053 +26.045 +26.04 +26.037 +26.01 +25.998 +26.034 +26.072 +26.08 +26.034 +26.03 +26.063 +26.117 +26.168 diff --git a/data/samples/neon_pm10_dust.csv b/data/samples/neon_pm10_dust.csv new file mode 100644 index 0000000..11abafd --- /dev/null +++ b/data/samples/neon_pm10_dust.csv @@ -0,0 +1,1024 @@ +0.016 +0.018 +0.017 +0.017 +0.018 +0.016 +0.014 +0.013 +0.023 +0.016 +0.018 +0.023 +0.017 +0.017 +0.016 +0.017 +0.017 +0.02 +0.021 +0.022 +0.022 +0.021 +0.021 +0.027 +0.029 +0.032 +0.032 +0.028 +0.025 +0.022 +0.017 +0.015 +0.015 +0.013 +0.012 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.013 +0.014 +0.015 +0.015 +0.015 +0.015 +0.015 +0.015 +0.015 +0.016 +0.022 +0.025 +0.022 +0.022 +0.02 +0.019 +0.022 +0.018 +0.018 +0.019 +0.019 +0.017 +0.017 +0.017 +0.018 +0.019 +0.019 +0.017 +0.017 +0.016 +0.015 +0.02 +0.019 +0.015 +0.009 +0.006 +0.007 +0.006 +0.006 +0.005 +0.002 +0.002 +0.002 +0.004 +0.005 +0.006 +0.006 +0.007 +0.008 +0.007 +0.007 +0.007 +0.006 +0.007 +0.007 +0.007 +0.008 +0.008 +0.007 +0.008 +0.008 +0.008 +0.007 +0.007 +0.008 +0.007 +0.007 +0.007 +0.007 +0.007 +0.006 +0.007 +0.008 +0.008 +0.01 +0.01 +0.007 +0.007 +0.007 +0.007 +0.007 +0.007 +0.008 +0.008 +0.009 +0.01 +0.012 +0.012 +0.012 +0.013 +0.013 +0.013 +0.014 +0.014 +0.016 +0.018 +0.017 +0.018 +0.019 +0.015 +0.014 +0.013 +0.012 +0.011 +0.012 +0.012 +0.011 +0.012 +0.012 +0.012 +0.016 +0.019 +0.024 +0.025 +0.027 +0.029 +0.028 +0.029 +0.028 +0.025 +0.028 +0.095 +0.13 +0.142 +0.035 +0.031 +0.027 +0.026 +0.024 +0.021 +0.022 +0.021 +0.019 +0.017 +0.014 +0.013 +0.012 +0.012 +0.012 +0.012 +0.013 +0.012 +0.012 +0.011 +0.01 +0.014 +0.015 +0.014 +0.013 +0.012 +0.011 +0.011 +0.01 +0.01 +0.009 +0.009 +0.009 +0.009 +0.01 +0.011 +0.011 +0.012 +0.012 +0.013 +0.013 +0.013 +0.014 +0.014 +0.015 +0.015 +0.016 +0.016 +0.015 +0.015 +0.016 +0.009 +0.006 +0.004 +0.003 +0.003 +0.005 +0.008 +0.007 +0.005 +0.005 +0.008 +0.012 +0.016 +0.016 +0.017 +0.016 +0.015 +0.014 +0.012 +0.01 +0.009 +0.011 +0.009 +0.008 +0.007 +0.007 +0.009 +0.008 +0.015 +0.017 +0.013 +0.011 +0.012 +0.011 +0.009 +0.01 +0.01 +0.01 +0.011 +0.011 +0.011 +0.01 +0.01 +0.009 +0.009 +0.009 +0.008 +0.007 +0.007 +0.007 +0.007 +0.008 +0.007 +0.006 +0.006 +0.006 +0.007 +0.008 +0.007 +0.007 +0.005 +0.006 +0.007 +0.006 +0.006 +0.006 +0.005 +0.005 +0.004 +0.005 +0.005 +0.004 +0.005 +0.007 +0.009 +0.005 +0.006 +0.005 +0.005 +0.006 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.004 +0.004 +0.004 +0.005 +0.005 +0.005 +0.005 +0.004 +0.004 +0.004 +0.004 +0.004 +0.005 +0.006 
+0.006 +0.006 +0.006 +0.005 +0.005 +0.004 +0.005 +0.006 +0.005 +0.006 +0.005 +0.004 +0.004 +0.005 +0.005 +0.004 +0.006 +0.005 +0.004 +0.003 +0.003 +0.002 +0.001 +0.002 +0.002 +0.003 +0.003 +0.004 +0.005 +0.005 +0.005 +0.004 +0.004 +0.004 +0.004 +0.004 +0.004 +0.004 +0.004 +0.005 +0.005 +0.005 +0.006 +0.006 +0.008 +0.006 +0.005 +0.006 +0.007 +0.007 +0.006 +0.006 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.006 +0.006 +0.006 +0.006 +0.005 +0.006 +0.005 +0.005 +0.004 +0.005 +0.005 +0.005 +0.006 +0.006 +0.006 +0.006 +0.007 +0.006 +0.006 +0.007 +0.011 +0.009 +0.007 +0.008 +0.008 +0.009 +0.01 +0.01 +0.01 +0.01 +0.008 +0.007 +0.007 +0.007 +0.006 +0.006 +0.006 +0.005 +0.005 +0.006 +0.006 +0.006 +0.006 +0.006 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.006 +0.008 +0.01 +0.009 +0.01 +0.009 +0.009 +0.009 +0.009 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.007 +0.007 +0.007 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.008 +0.007 +0.008 +0.008 +0.008 +0.009 +0.009 +0.009 +0.009 +0.009 +0.009 +0.009 +0.008 +0.008 +0.008 +0.009 +0.009 +0.009 +0.009 +0.007 +0.006 +0.006 +0.007 +0.007 +0.007 +0.007 +0.008 +0.008 +0.008 +0.008 +0.009 +0.008 +0.009 +0.008 +0.008 +0.007 +0.008 +0.008 +0.007 +0.007 +0.007 +0.007 +0.007 +0.007 +0.008 +0.009 +0.01 +0.009 +0.009 +0.01 +0.009 +0.009 +0.009 +0.009 +0.009 +0.009 +0.009 +0.009 +0.01 +0.009 +0.009 +0.009 +0.01 +0.01 +0.01 +0.01 +0.01 +0.011 +0.01 +0.01 +0.009 +0.009 +0.008 +0.009 +0.009 +0.01 +0.011 +0.012 +0.014 +0.015 +0.015 +0.015 +0.016 +0.017 +0.016 +0.014 +0.013 +0.012 +0.012 +0.013 +0.012 +0.013 +0.015 +0.015 +0.013 +0.013 +0.012 +0.01 +0.011 +0.011 +0.011 +0.021 +0.013 +0.012 +0.011 +0.01 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.01 +0.01 +0.01 +0.011 +0.011 +0.011 +0.01 +0.011 +0.011 +0.01 +0.01 +0.009 +0.009 +0.011 +0.011 +0.01 +0.01 +0.01 +0.01 +0.011 +0.011 +0.011 +0.01 +0.011 +0.011 +0.011 +0.012 +0.012 +0.013 +0.014 +0.014 +0.014 +0.014 +0.013 +0.013 +0.013 +0.014 +0.017 +0.018 +0.019 +0.019 +0.021 +0.014 +0.012 +0.013 +0.01 +0.01 +0.013 +0.013 +0.011 +0.011 +0.012 +0.018 +0.02 +0.017 +0.013 +0.012 +0.015 +0.016 +0.017 +0.016 +0.015 +0.014 +0.014 +0.014 +0.014 +0.014 +0.014 +0.014 +0.014 +0.014 +0.015 +0.014 +0.009 +0.008 +0.008 +0.008 +0.008 +0.009 +0.009 +0.009 +0.01 +0.011 +0.011 +0.011 +0.01 +0.008 +0.008 +0.009 +0.011 +0.011 +0.011 +0.01 +0.009 +0.008 +0.007 +0.008 +0.008 +0.008 +0.008 +0.009 +0.011 +0.01 +0.01 +0.01 +0.01 +0.009 +0.009 +0.01 +0.016 +0.019 +0.018 +0.018 +0.018 +0.018 +0.019 +0.019 +0.019 +0.019 +0.012 +0.013 +0.009 +0.009 +0.01 +0.011 +0.011 +0.011 +0.012 +0.014 +0.015 +0.012 +0.012 +0.008 +0.009 +0.009 +0.013 +0.01 +0.009 +0.006 +0.012 +0.011 +0.011 +0.013 +0.013 +0.013 +0.01 +0.01 +0.008 +0.008 +0.006 +0.006 +0.007 +0.007 +0.008 +0.008 +0.008 +0.008 +0.009 +0.009 +0.009 +0.01 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.011 +0.01 +0.01 +0.011 +0.011 +0.011 +0.011 +0.011 +0.012 +0.013 +0.015 +0.014 +0.012 +0.007 +0.004 +0.004 +0.002 +0.002 +0.003 +0.002 +0.003 +0.002 +0.002 +0.002 +0.002 +0.002 +0.002 +0.002 +0.003 +0.003 +0.004 +0.005 +0.004 +0.003 +0.003 +0.004 +0.004 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.005 +0.006 +0.007 +0.006 +0.006 +0.006 +0.005 +0.005 +0.006 +0.005 +0.005 +0.005 +0.005 +0.005 +0.006 +0.007 +0.009 +0.007 +0.007 +0.01 +0.011 +0.012 +0.004 +0.003 +0.003 +0.002 +0.002 +0.002 +0.002 +0.002 +0.002 +0.003 +0.003 +0.003 +0.003 +0.003 +0.003 +0.002 +0.002 +0.002 
+0.003 +0.003 +0.005 +0.006 +0.006 +0.005 +0.006 +0.008 +0.007 +0.007 +0.009 +0.009 +0.01 +0.01 +0.01 +0.009 +0.009 +0.009 +0.008 +0.007 +0.006 +0.006 +0.006 +0.007 +0.008 +0.008 +0.009 +0.009 +0.009 +0.008 +0.008 +0.008 +0.008 +0.008 +0.01 +0.01 +0.011 +0.009 +0.011 +0.013 +0.012 +0.013 +0.014 +0.013 +0.014 +0.015 +0.015 +0.016 +0.016 +0.017 +0.015 +0.013 +0.013 +0.011 +0.01 +0.011 +0.014 +0.016 +0.017 +0.016 +0.017 +0.017 +0.016 +0.015 +0.015 +0.015 +0.017 +0.019 +0.02 +0.017 +0.013 +0.011 +0.009 +0.012 +0.012 +0.012 +0.012 +0.012 +0.011 +0.009 +0.01 +0.012 +0.013 +0.015 +0.016 +0.02 +0.024 +0.024 +0.023 +0.019 +0.014 +0.012 +0.01 +0.01 +0.01 +0.01 +0.01 +0.01 +0.011 +0.012 +0.012 +0.013 +0.013 +0.014 +0.014 +0.015 +0.014 +0.015 +0.014 +0.012 +0.012 +0.012 +0.013 +0.012 +0.011 +0.012 +0.013 +0.012 +0.011 +0.011 +0.011 +0.011 +0.012 +0.014 +0.016 +0.015 +0.014 +0.013 +0.013 +0.014 +0.014 +0.013 +0.012 +0.013 +0.013 +0.013 +0.011 +0.011 +0.011 +0.011 +0.012 +0.012 +0.012 +0.012 +0.012 +0.012 +0.011 +0.01 +0.012 +0.029 +0.048 +0.048 +0.046 +0.049 +0.044 +0.041 +0.038 +0.033 +0.032 +0.03 +0.031 +0.032 +0.032 +0.034 +0.036 +0.034 +0.034 +0.035 +0.037 +0.038 +0.04 +0.042 +0.035 +0.02 +0.02 +0.022 +0.019 +0.017 +0.019 +0.02 +0.022 +0.023 +0.023 +0.023 +0.025 +0.025 diff --git a/data/samples/neon_wind_dir.csv b/data/samples/neon_wind_dir.csv new file mode 100644 index 0000000..5b969fc --- /dev/null +++ b/data/samples/neon_wind_dir.csv @@ -0,0 +1,1024 @@ +290.91 +350.03 +331.11 +299.86 +255.24 +139.19 +26.93 +54.0 +82.2 +245.5 +276.16 +270.98 +261.29 +274.81 +289.69 +315.91 +242.39 +226.89 +269.11 +257.04 +245.93 +296.35 +317.33 +313.63 +280.26 +313.39 +288.6 +286.99 +309.51 +312.98 +317.31 +351.52 +324.18 +303.35 +316.03 +300.18 +306.0 +318.81 +249.42 +225.52 +251.95 +253.4 +253.12 +244.43 +310.42 +286.49 +247.12 +255.94 +311.48 +351.74 +304.62 +286.13 +306.22 +303.83 +290.82 +275.71 +274.36 +235.86 +249.01 +279.37 +285.33 +249.02 +241.72 +242.67 +215.38 +219.67 +211.1 +246.66 +228.31 +205.35 +224.4 +228.72 +225.68 +229.05 +221.68 +226.37 +236.1 +234.52 +211.88 +220.92 +230.76 +229.19 +237.41 +218.25 +221.85 +267.87 +286.12 +276.28 +293.23 +264.67 +191.55 +165.91 +190.1 +239.28 +246.81 +243.64 +264.16 +207.09 +164.08 +168.36 +182.09 +176.25 +173.52 +194.09 +175.79 +216.85 +178.75 +192.93 +181.32 +186.55 +177.46 +180.43 +211.54 +190.7 +230.55 +221.88 +157.22 +181.57 +176.36 +213.56 +173.62 +187.67 +180.35 +176.97 +172.33 +155.13 +160.78 +184.75 +168.14 +160.81 +190.07 +164.56 +154.57 +219.48 +248.35 +207.8 +173.08 +222.89 +206.09 +175.21 +185.01 +192.41 +186.78 +308.15 +349.05 +115.53 +158.08 +189.99 +206.92 +268.87 +88.79 +109.82 +273.41 +301.64 +126.97 +111.68 +98.43 +115.54 +80.18 +338.79 +315.94 +35.72 +109.51 +134.23 +145.59 +174.09 +174.07 +144.81 +154.15 +167.5 +152.3 +119.99 +156.94 +162.3 +162.22 +175.82 +141.05 +166.99 +191.46 +176.29 +189.34 +202.85 +171.17 +193.99 +198.35 +207.5 +86.5 +170.26 +152.04 +174.93 +208.25 +157.51 +135.75 +189.12 +164.72 +127.83 +110.07 +151.78 +171.64 +143.45 +160.15 +192.48 +188.62 +184.78 +178.58 +181.74 +180.75 +178.75 +200.65 +193.41 +164.29 +122.45 +43.96 +332.34 +281.58 +145.89 +80.73 +201.52 +254.81 +152.32 +171.64 +160.97 +128.17 +146.28 +167.49 +173.75 +169.19 +175.14 +152.0 +101.12 +129.27 +138.27 +131.46 +149.75 +186.6 +137.18 +172.44 +181.16 +190.07 +190.42 +153.43 +170.14 +120.0 +93.1 +104.41 +216.44 +248.26 +229.32 +200.77 +206.98 +193.95 +189.28 +181.8 +176.19 +133.82 +35.02 +94.58 +118.66 +110.72 +76.52 +12.63 +262.63 +149.33 +95.83 +94.86 
+148.37 +135.01 +186.86 +183.84 +173.41 +132.9 +149.97 +151.32 +141.44 +147.91 +164.41 +157.49 +168.03 +154.16 +123.84 +145.65 +161.98 +153.96 +168.2 +195.07 +200.08 +129.47 +87.3 +76.51 +74.8 +104.38 +354.31 +6.72 +34.77 +351.2 +273.75 +303.65 +199.36 +140.96 +142.19 +143.87 +87.24 +98.17 +168.66 +154.87 +330.43 +266.79 +224.92 +117.7 +280.09 +158.68 +128.62 +137.91 +149.93 +111.89 +354.42 +321.45 +271.71 +185.53 +165.68 +57.8 +290.97 +283.08 +76.26 +84.35 +105.67 +39.44 +313.76 +253.79 +230.88 +219.31 +260.01 +222.2 +101.47 +110.68 +98.63 +91.18 +296.57 +248.64 +244.93 +217.2 +206.75 +203.58 +184.14 +185.47 +166.72 +184.5 +163.6 +129.44 +110.99 +104.65 +99.33 +107.57 +143.25 +108.21 +102.53 +144.22 +128.08 +63.91 +316.42 +336.58 +62.38 +90.25 +340.64 +61.69 +263.52 +275.56 +298.73 +224.98 +229.36 +193.76 +184.67 +212.19 +177.95 +114.78 +130.95 +265.68 +266.53 +234.82 +167.31 +166.21 +182.91 +211.27 +162.28 +155.38 +179.29 +302.36 +338.16 +336.0 +62.77 +88.81 +142.96 +140.14 +207.73 +301.33 +315.99 +239.45 +242.73 +298.26 +309.29 +279.09 +145.16 +114.01 +164.21 +141.33 +114.69 +167.55 +232.4 +250.45 +181.2 +181.33 +180.65 +177.54 +170.72 +181.63 +169.05 +168.91 +166.77 +160.96 +212.5 +204.97 +205.25 +212.43 +194.91 +198.46 +153.73 +91.07 +34.83 +10.65 +45.97 +76.41 +125.21 +128.57 +250.84 +245.08 +217.34 +224.99 +232.23 +221.42 +169.09 +111.95 +106.91 +95.9 +111.04 +142.65 +167.89 +198.49 +232.02 +230.63 +202.81 +199.01 +184.89 +155.28 +143.93 +111.12 +109.71 +112.59 +128.79 +192.19 +163.75 +207.3 +250.17 +266.15 +272.69 +296.09 +279.48 +299.97 +295.91 +308.54 +319.24 +265.22 +131.61 +143.21 +185.31 +208.23 +160.97 +146.32 +183.18 +220.57 +223.31 +226.56 +223.93 +206.06 +183.1 +137.22 +155.55 +171.08 +162.95 +147.63 +153.9 +163.42 +153.2 +248.9 +212.19 +191.4 +221.33 +244.25 +303.94 +19.17 +24.43 +33.54 +26.99 +61.65 +90.47 +91.46 +124.9 +84.22 +137.37 +218.2 +224.1 +249.79 +238.26 +191.1 +158.08 +145.18 +142.85 +143.25 +163.59 +199.76 +194.77 +205.05 +168.58 +175.43 +173.78 +188.62 +186.0 +142.18 +140.84 +146.41 +127.45 +60.77 +14.73 +63.36 +212.14 +89.23 +42.76 +59.62 +129.58 +185.11 +196.19 +225.5 +211.16 +198.24 +207.5 +223.48 +232.01 +261.91 +348.42 +32.48 +42.89 +41.59 +57.6 +53.08 +75.42 +74.46 +51.86 +74.38 +76.01 +85.75 +78.28 +102.33 +195.63 +209.73 +225.65 +231.82 +34.89 +68.62 +88.78 +119.71 +138.92 +219.36 +148.16 +78.19 +348.62 +341.46 +46.6 +322.35 +356.49 +49.28 +237.77 +355.07 +50.86 +44.13 +344.67 +34.9 +47.52 +45.75 +63.68 +63.69 +49.84 +83.25 +26.64 +351.31 +323.44 +324.57 +326.09 +290.36 +57.58 +61.98 +80.98 +310.04 +249.31 +244.09 +275.61 +194.33 +194.35 +175.65 +151.82 +59.26 +333.13 +234.59 +220.97 +273.51 +208.94 +185.58 +173.15 +211.93 +196.7 +185.02 +212.17 +206.46 +193.76 +179.13 +193.37 +197.86 +223.19 +239.85 +182.18 +186.44 +62.22 +25.87 +16.79 +20.28 +32.58 +40.71 +63.49 +42.42 +29.5 +57.19 +64.39 +354.37 +339.97 +338.24 +306.02 +286.65 +325.52 +50.1 +51.65 +36.12 +353.0 +68.46 +99.41 +71.39 +299.65 +277.65 +66.63 +50.31 +38.56 +43.34 +67.7 +49.35 +48.56 +146.76 +347.23 +38.09 +347.28 +259.92 +222.06 +268.22 +274.62 +255.67 +241.17 +229.56 +43.32 +115.92 +259.88 +222.13 +101.23 +88.19 +98.2 +79.67 +76.25 +265.09 +291.91 +341.77 +44.19 +24.71 +78.93 +152.06 +230.77 +141.07 +32.31 +293.99 +264.67 +15.48 +13.86 +45.79 +90.63 +69.59 +67.78 +41.04 +45.26 +336.81 +356.61 +356.29 +19.66 +351.93 +347.26 +26.82 +353.17 +28.89 +26.26 +71.63 +47.58 +88.67 +129.9 +100.98 +86.44 +79.73 +89.27 +68.13 +62.54 +63.15 +59.14 +49.57 +49.47 +58.46 +60.43 +308.76 +345.43 +24.67 
+12.33 +34.01 +45.1 +54.0 +181.76 +310.61 +292.96 +237.56 +219.35 +275.83 +333.95 +336.13 +330.08 +292.92 +242.75 +119.78 +82.72 +67.46 +282.08 +236.39 +275.38 +305.48 +328.53 +357.27 +56.29 +277.25 +167.04 +154.57 +168.72 +193.86 +209.55 +201.86 +195.3 +140.73 +145.0 +171.37 +176.98 +163.75 +155.45 +157.85 +149.14 +309.84 +83.72 +142.38 +203.55 +280.62 +271.87 +237.82 +242.47 +244.73 +250.24 +242.82 +354.38 +23.65 +57.13 +73.79 +116.62 +236.95 +59.16 +264.15 +263.22 +65.02 +87.84 +82.06 +277.77 +99.38 +150.34 +224.53 +55.85 +261.14 +83.25 +120.9 +250.58 +68.45 +184.95 +159.04 +91.73 +264.77 +266.69 +129.31 +96.93 +73.66 +197.17 +100.33 +268.58 +292.97 +240.13 +218.1 +218.36 +229.88 +137.25 +135.32 +78.65 +268.48 +253.3 +218.52 +236.96 +228.7 +73.6 +131.65 +253.49 +354.65 +278.35 +286.01 +296.86 +200.4 +71.9 +149.71 +247.24 +306.48 +78.04 +237.83 +194.01 +100.89 +231.75 +234.92 +194.45 +86.52 +263.34 +216.36 +216.94 +136.01 +335.73 +76.07 +72.56 +267.62 +210.87 +68.46 +98.36 +198.24 +242.88 +240.04 +318.31 +172.47 +205.45 +326.94 +153.68 +203.6 +70.81 +71.89 +28.04 +247.2 +103.4 +187.99 +318.58 +309.45 +256.89 +225.19 +130.95 +237.48 +193.58 +88.62 +161.37 +229.64 +98.66 +321.44 +222.7 +135.75 +159.84 +76.19 +30.76 +262.65 +258.24 +139.39 +194.14 +201.38 +295.24 +330.88 +315.99 +334.08 +242.81 +160.62 +254.98 +330.62 +88.66 +288.59 +1.47 +132.4 +149.04 +249.87 +183.89 +174.27 +175.43 +328.49 +194.53 +337.12 +274.72 +201.36 +187.12 +250.04 +259.37 +236.41 +195.77 +175.9 +161.35 +207.66 +175.05 +227.41 +217.12 +150.54 +220.29 +186.27 +152.35 +204.88 +208.12 +173.04 +137.54 +222.93 +220.46 +106.01 +94.55 +175.84 +205.67 +344.87 +165.25 +191.47 +207.82 +201.4 +195.4 +235.73 +231.05 +196.42 +189.13 +187.06 +238.39 +283.64 +250.52 +184.46 +207.38 +189.95 +208.92 +187.65 +59.87 +302.47 +258.06 +220.9 +196.28 +251.27 +109.08 +184.18 +181.55 +171.76 +168.39 +199.89 +210.76 +163.66 +169.91 +201.31 +220.09 +165.88 +198.3 +125.5 +157.77 +173.48 +180.94 +186.9 +190.51 +189.04 +167.11 +189.3 +165.71 +166.33 +137.12 +129.75 +164.53 +50.03 +172.64 +185.45 +232.37 +197.07 +227.98 +206.24 +194.98 +210.11 +212.99 +213.68 +149.67 +178.79 +212.07 +168.21 +160.5 +157.46 +194.77 +213.11 +248.54 +196.92 +97.44 +140.9 +107.17 +163.22 +166.62 +208.22 +145.18 diff --git a/data/samples/nyc29.csv b/data/samples/nyc29.csv new file mode 100644 index 0000000..b19425a --- /dev/null +++ b/data/samples/nyc29.csv @@ -0,0 +1,1024 @@ +-73.9178287967861 +-73.9045620114538 +-73.8503571690125 +-73.8333884075798 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9142748990343 +-73.9142748990343 +-73.9142748990343 +-73.9142748990343 +-73.9142748990343 +-73.8887029914239 +-73.8887029914239 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9643503153147 +-73.9211006385839 +-73.9847529488273 +-73.9852730173781 +-73.9844488026788 +-73.9718465721095 +-73.9732921659778 +-73.9732921659778 +-73.9676030066437 +-73.9653859845536 +-73.9708449979441 +-73.9653859845536 +-73.9653859845536 +-73.9686986216194 +-73.9529083396659 +-73.9558668531537 +-73.9558668531537 +-73.9558668531537 +-73.9558668531537 +-73.9574817316137 +-73.9478284760544 +-73.9492507504987 +-73.9555687652631 +-73.9409412662537 +-73.9371587003197 +-73.9343765901425 +-73.9340484578665 +-73.9211006385839 +-73.917059184637 +-73.917059184637 
+-73.9211006385839 +-73.9279862083642 +-73.941525938106 +-73.9423027709709 +-74.0159272589091 +-74.0110347878594 +-74.0008512687761 +-74.0008512687761 +-73.9990257363632 +-73.9990257363632 +-74.0046900692044 +-74.0064659231207 +-73.9863359551485 +-73.9779039859433 +-73.9908951429948 +-73.9893366443239 +-73.9893366443239 +-73.9966778913622 +-73.9863580319865 +-73.9813802920869 +-73.982894182905 +-73.9974930360451 +-73.9974930360451 +-73.9784821236305 +-74.002197718052 +-74.002197718052 +-73.988823380241 +-73.988823380241 +-73.988823380241 +-73.9880191428587 +-73.9812839711967 +-73.987630489831 +-73.9820267832352 +-73.9806625542192 +-73.9642221915934 +-73.9530957582689 +-73.9600590568013 +-73.9496114529279 +-73.948549377931 +-73.9477756687571 +-73.9477756687571 +-73.9493807738092 +-73.9366177613284 +-73.9366177613284 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.899463107156 +-73.923769051299 +-73.923769051299 +-73.923769051299 +-73.923769051299 +-73.923769051299 +-73.9190223135556 +-73.8854920574974 +-73.8854920574974 +-73.8854920574974 +-73.8854920574974 +-73.8854920574974 +-73.8854920574974 +-73.8854920574974 +-73.9161016420562 +-73.7911899471642 +-73.8459036159541 +-73.9136043349563 +-73.9214665264784 +-73.9161028341839 +-73.9035223022303 +-73.8949457697803 +-73.9097463064488 +-73.9097463064488 +-73.8835743725038 +-73.8907251168984 +-73.8776792824945 +-73.8869099655867 +-73.9052287876072 +-73.9007509772445 +-73.9007509772445 +-73.9179995674302 +-73.9027050840344 +-73.9024988117187 +-73.9049558486232 +-73.9066065548107 +-73.9027050840344 +-73.9027050840344 +-73.9027050840344 +-73.9027050840344 +-73.9007509772445 +-73.9007509772445 +-73.8854920574974 +-73.8854920574974 +-73.8789031538803 +-73.8789031538803 +-73.8789031538803 +-73.8789031538803 +-73.8789031538803 +-73.8789031538803 +-73.8789031538803 +-73.8789031538803 +-73.8789031538803 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8994885776544 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8836880154372 +-73.8452402691305 +-73.8375826525549 +-73.8375826525549 +-73.8375826525549 +-73.8375826525549 +-73.8783238814486 +-73.9049558486232 +-73.9027050840344 +-73.9043996114481 +-73.904473311996 +-73.904473311996 +-73.9049558486232 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.9027050840344 +-73.9027050840344 +-73.9007509772445 +-73.9043996114481 +-73.9043996114481 +-73.9043996114481 +-73.9043996114481 +-73.9043996114481 +-73.9043996114481 +-73.9043996114481 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.904473311996 +-73.9027050840344 +-73.9027050840344 +-73.9024770852932 +-73.9032360297448 +-73.9013160843574 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.9029871483743 +-73.9032360297448 +-73.9013160843574 +-73.8978794747497 +-73.9013160843574 
+-73.9013160843574 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.904754276545 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9032360297448 +-73.9006624793962 +-73.9006624793962 +-73.9029871483743 +-73.9032360297448 +-73.9029871483743 +-73.9186811347498 +-73.9176697656444 +-73.9186811347498 +-73.9176697656444 +-73.9186811347498 +-73.9230666839022 +-73.9283236363615 +-73.9136043349563 +-73.926838129657 +-73.9136043349563 +-73.9283236363615 +-73.9200262439963 +-73.9155686294386 +-73.926838129657 +-73.9267476586166 +-73.9200262439963 +-73.9230666839022 +-73.926838129657 +-73.9267476586166 +-73.9230666839022 +-73.9283236363615 +-73.9160824118785 +-73.9155686294386 +-73.9155686294386 +-73.9155686294386 +-73.9155686294386 +-73.9230666839022 +-73.9230666839022 +-73.9230666839022 +-73.9283236363615 +-73.9283236363615 +-73.9283236363615 +-73.9283236363615 +-73.9283236363615 +-73.9205598410639 +-73.9200262439963 +-73.9200262439963 +-73.9200262439963 +-73.9185135954165 +-73.9185135954165 +-73.9136043349563 +-73.9230666839022 +-73.9230666839022 +-73.9267476586166 +-73.9267476586166 +-73.9283236363615 +-73.9140272029836 +-73.9140272029836 +-73.9109561448388 +-73.9135737246321 +-73.9140272029836 +-73.9143735673483 +-73.9140272029836 +-73.9143735673483 +-73.9109561448388 +-73.917960280314 +-73.9140272029836 +-73.9140272029836 +-73.917960280314 +-73.9140272029836 +-73.9140272029836 +-73.9109561448388 +-73.9135737246321 +-73.917960280314 +-73.9140272029836 +-73.9176680437909 +-73.917960280314 +-73.917960280314 +-73.917960280314 +-73.9140272029836 +-73.9109096846555 +-73.9076155631691 +-73.9109096846555 +-73.9109096846555 +-73.9109096846555 +-73.9109096846555 +-73.9076155631691 +-73.9076155631691 +-73.9076155631691 +-73.9076155631691 +-73.9076155631691 +-73.8970499474741 +-73.9109096846555 +-73.9109096846555 +-73.9109096846555 +-73.9109096846555 +-73.9109096846555 +-73.8970499474741 +-73.9076155631691 +-73.9214665264784 +-73.9055662773781 +-73.9055662773781 +-73.9055662773781 +-73.9034545710959 +-73.9055662773781 +-73.9055662773781 +-73.9055662773781 +-73.9214665264784 +-73.9214665264784 +-73.9214665264784 +-73.9214665264784 +-73.9214665264784 +-73.9214665264784 +-73.9214665264784 +-73.9214665264784 +-73.9055662773781 +-73.9055662773781 +-73.9055662773781 +-73.9111799732021 +-73.9162601502018 +-73.9161983851856 +-73.9111799732021 +-73.9034545710959 +-73.9214665264784 +-73.9214665264784 +-73.9055662773781 +-73.9111799732021 +-73.911169121825 +-73.9034545710959 +-73.9034545710959 +-73.9214665264784 +-73.9185327446047 +-73.9162601502018 +-73.9161983851856 +-73.9084265205605 +-73.9084265205605 +-73.9214665264784 +-73.9161028341839 +-73.9161028341839 +-73.9161028341839 +-73.9034545710959 +-73.9162601502018 +-73.9162601502018 +-73.9055662773781 +-73.911169121825 +-73.911169121825 +-73.9134321733787 +-73.9134321733787 +-73.9034545710959 +-73.9084265205605 +-73.9161028341839 +-73.9161028341839 +-73.9131968266341 +-73.9185327446047 +-73.9157690208399 +-73.9179995674302 +-73.9078093639263 +-73.9035223022303 +-73.9035223022303 +-73.896717083859 +-73.896717083859 +-73.896717083859 +-73.896717083859 +-73.896717083859 +-73.8985179200073 +-73.8985179200073 +-73.8985179200073 +-73.8984925502663 +-73.8984925502663 +-73.896717083859 +-73.896717083859 +-73.896717083859 
+-73.896717083859 +-73.896717083859 +-73.896717083859 +-73.8949457697803 +-73.8978848659251 +-73.8978848659251 +-73.8978848659251 +-73.8949457697803 +-73.8993626402983 +-73.8993626402983 +-73.8949457697803 +-73.8949457697803 +-73.8697215977724 +-73.8697215977724 +-73.7510288345036 +-73.7510288345036 +-73.7510288345036 +-73.7510288345036 +-73.7510288345036 +-73.7510288345036 +-73.7510288345036 +-73.7510288345036 +-73.7510288345036 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8341586116489 +-73.8039300306254 +-73.8949457697803 +-73.8949457697803 +-73.8949457697803 +-73.901759750788 +-73.901759750788 +-73.901759750788 +-73.901759750788 +-73.901759750788 +-73.901759750788 +-73.901759750788 +-73.901759750788 +-73.901759750788 +-73.9001613534693 +-73.9095284248379 +-73.8985325507501 +-73.8947422280043 +-73.899263698011 +-73.899263698011 +-73.892289128341 +-73.892289128341 +-73.8930796119833 +-73.8930796119833 +-73.8947422280043 +-73.8947422280043 +-73.8947422280043 +-73.8947422280043 +-73.8947422280043 +-73.8947422280043 +-73.8947422280043 +-73.8935243236158 +-73.8935243236158 +-73.8936538993119 +-73.899263698011 +-73.8947422280043 +-73.899263698011 +-73.8935243236158 +-73.8882671282473 +-73.8882671282473 +-73.8869131682214 +-73.8902321721105 +-73.8882671282473 +-73.8928039883946 +-73.8928039883946 +-73.8827147149622 +-73.8827147149622 +-73.8859345244846 +-73.8811393974359 +-73.8882671282473 +-73.8882671282473 +-73.8882671282473 +-73.8859345244846 +-73.8869131682214 +-73.8811393974359 +-73.8902321721105 +-73.8827147149622 +-73.8827147149622 +-73.8827147149622 +-73.8827147149622 +-73.8827147149622 +-73.8827147149622 +-73.8811393974359 +-73.8810447923587 +-73.8882671282473 +-73.8610061907765 +-73.8837430953994 +-73.8837430953994 +-73.8837430953994 +-73.8837430953994 +-73.8837430953994 +-73.9121042888013 +-73.8907251168984 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.9097463064488 +-73.8874172691845 +-73.8874172691845 +-73.8835743725038 +-73.8907251168984 +-73.8907251168984 +-73.8835743725038 +-73.8835743725038 +-73.8835743725038 +-73.8835743725038 +-73.8907251168984 +-73.8878293638284 +-73.8878293638284 +-73.8835743725038 +-73.8835743725038 +-73.8907251168984 +-73.8835743725038 +-73.8835743725038 +-73.8835743725038 +-73.8907251168984 +-73.8878293638284 +-73.9052074227367 +-73.9052074227367 +-73.9052074227367 
+-73.8815465726662 +-73.8757204957781 +-73.8752170605517 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8815465726662 +-73.8788749478325 +-73.8752170605517 +-73.8776792824945 +-73.8757204957781 +-73.8757204957781 +-73.8757204957781 +-73.8757204957781 +-73.8757204957781 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8788749478325 +-73.8757204957781 +-73.8757204957781 +-73.8817619421546 +-73.8815465726662 +-73.8815465726662 +-73.8815465726662 +-73.8815465726662 +-73.8757204957781 +-73.8752170605517 +-73.8815465726662 +-73.8815465726662 +-73.8788749478325 +-73.8817619421546 +-73.8869099655867 +-73.8869099655867 +-73.8869099655867 +-73.8869099655867 +-73.9064951312952 +-73.9064951312952 +-73.8869099655867 +-73.8987397211706 +-73.8947583548884 +-73.8947583548884 +-73.8869099655867 +-73.8869099655867 +-73.8869099655867 +-73.8869099655867 +-73.8910401089752 +-73.8983087566039 +-73.8987397211706 +-73.8983087566039 +-73.8869099655867 +-73.8869099655867 +-73.8869099655867 +-73.8869099655867 +-73.8947583548884 +-73.9064951312952 +-73.9064951312952 +-73.9064951312952 +-73.9064951312952 +-73.8947583548884 +-73.8947583548884 +-73.8974899762421 +-73.8974899762421 +-73.8983087566039 +-73.8983087566039 +-73.8869099655867 +-73.8869099655867 +-73.8983087566039 +-73.8983087566039 +-73.8983087566039 +-73.8983087566039 +-73.8869099655867 +-73.9064951312952 +-73.8869099655867 +-73.8869099655867 +-73.8869099655867 +-73.8910401089752 +-73.9176680437909 +-73.9176680437909 +-73.9176680437909 +-73.9176680437909 +-73.8913373365553 +-73.9025728531831 +-73.9121042888013 +-73.9134856395468 +-73.9108236696828 +-73.9010989448924 +-73.9136969990299 +-73.9010989448924 +-73.9010989448924 +-73.9134856395468 +-73.9134856395468 +-73.9010989448924 +-73.9025728531831 +-73.9025728531831 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9098955926662 +-73.9136969990299 +-73.9134856395468 +-73.9121042888013 +-73.8912556242193 +-73.8912556242193 +-73.9065557088437 +-73.9065557088437 +-73.9025728531831 +-73.9137078488323 +-73.8912556242193 +-73.9065557088437 +-73.9010989448924 +-73.9010989448924 +-73.9010989448924 +-73.9010989448924 +-73.9010989448924 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9098955926662 +-73.9098955926662 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9061868885927 +-73.9121042888013 +-73.9136969990299 +-73.9108236696828 +-73.9025728531831 +-73.9121042888013 +-73.9121042888013 +-73.9121042888013 +-73.9025728531831 +-73.9015245806151 
+-73.9052287876072 +-73.9052287876072 +-73.9052287876072 +-73.9052287876072 +-73.9052287876072 +-73.9024210122663 +-73.8947837229935 +-73.8885612676137 +-73.8897549887099 +-73.8897549887099 +-73.8861837463674 +-73.85630208017 +-73.7852192526964 +-73.8297527785378 +-73.833863128003 +-73.8666169620233 +-73.8666169620233 +-73.8666169620233 +-73.8384712101351 +-73.8431926051123 +-73.8431926051123 +-73.8384712101351 +-73.8384712101351 +-73.851163289447 +-73.9021389689345 +-73.9021389689345 +-73.900609268538 +-73.900609268538 +-73.8939016476961 +-73.8939016476961 +-73.898912490491 +-73.8939016476961 +-73.8939016476961 +-73.8939016476961 +-73.8894354094374 +-73.8989091652134 +-73.8943976509137 +-73.8943976509137 +-73.8939016476961 +-73.8939016476961 +-73.8894354094374 +-73.8894354094374 +-73.898212381198 +-73.8989091652134 +-73.8989091652134 +-73.898212381198 +-73.8943976509137 +-73.8943976509137 +-73.8988716057198 +-73.8988716057198 +-73.8968778220034 +-73.8988716057198 +-73.8939016476961 +-73.8988716057198 +-73.8943976509137 +-73.8869528438326 +-73.8869528438326 +-73.8869528438326 +-73.8857696984004 +-73.8869528438326 +-73.8869528438326 +-73.8869528438326 +-73.8865399122529 +-73.9001045936277 +-73.9001045936277 +-73.9001045936277 +-73.9001045936277 +-73.9007214730505 +-73.9007214730505 +-73.907194163898 +-73.907194163898 +-73.9061862662413 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9034772047069 +-73.9007214730505 +-73.9007214730505 +-73.9001045936277 +-73.9007214730505 +-73.8980748568653 +-73.8875866566702 +-73.8947837229935 +-73.8947837229935 +-73.8875866566702 +-73.8875866566702 +-73.8870942388797 +-73.8928166371847 +-73.8928166371847 +-73.8876388136702 +-73.8876388136702 +-73.8876388136702 +-73.8928166371847 +-73.8870942388797 +-73.8870942388797 +-73.8928166371847 +-73.8804631684171 +-73.8928166371847 +-73.8928166371847 +-73.8870942388797 +-73.8904578986841 +-73.8928166371847 +-73.8928166371847 +-73.8904578986841 +-73.8904578986841 +-73.8904578986841 +-73.8904578986841 +-73.8904578986841 +-73.8904578986841 +-73.8904578986841 +-73.8904578986841 +-73.8904578986841 +-73.8876388136702 +-73.8904578986841 +-73.8885612676137 +-73.8885612676137 +-73.8897549887099 +-73.8897549887099 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8911163341796 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8898148030972 +-73.8898148030972 +-73.8898148030972 +-73.8885612676137 +-73.8874047953542 +-73.8897549887099 +-73.8885612676137 +-73.8885612676137 +-73.8897549887099 +-73.8897549887099 +-73.8897549887099 +-73.8897549887099 +-73.8897549887099 
+-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8897549887099 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8885612676137 +-73.8804631684171 +-73.8804631684171 +-73.8847883365959 diff --git a/data/samples/poi_lat.csv b/data/samples/poi_lat.csv new file mode 100644 index 0000000..f4af343 --- /dev/null +++ b/data/samples/poi_lat.csv @@ -0,0 +1,1024 @@ +0.6831751987355026 +0.6481958916434496 +0.2583558531849646 +-0.09017534468637368 +0.1495165392541809 +0.6956349103400177 +0.6252545082533464 +0.6016149931624454 +-0.11225957748827528 +0.3658881086557751 +0.8160720108419833 +0.8482348646060551 +0.9165887455056887 +0.8092806186239532 +0.42256690118810314 +1.0813381306203314 +0.7016223593017205 +0.6765332513043021 +0.155612334377338 +0.8813864241203253 +0.6891141663290944 +0.6974384172337452 +1.1317248164980451 +0.743030295805286 +0.8061389774889952 +0.7374209530334804 +0.669774948589635 +0.9238561025855206 +0.16965036661697883 +0.7830054139557003 +0.8955866168400236 +1.0347426877288937 +0.5213298475707062 +0.7413722330158913 +0.6362452344040996 +0.7958168094044922 +0.9442754852064921 +0.8747396285523136 +0.18005980116408166 +0.6353531772308579 +0.7178393769348345 +0.6551093347360715 +0.7474954298083046 +0.003781546712654381 +0.6617706747145166 +0.15038920388017807 +0.5851652649623987 +-0.7494055957118763 +0.1617338440181412 +0.5836429500037148 +0.7025580497062619 +0.6528210141612345 +-0.1540107620780663 +0.8819488079904123 +0.6955137069197403 +0.8118689903860289 +0.5261295030136905 +0.889505307949658 +0.6463535996552333 +0.7545882539629373 +0.7513695759340511 +-0.6167799651075517 +0.756294798120443 +0.8580281009644678 +0.7635136738321638 +0.7286749627076327 +0.6621779182066486 +0.5777233749573674 +-0.7499098019402303 +0.7670770543883189 +0.6830830841360918 +0.6419746238307393 +0.9043714407417285 +0.3232053068720659 +1.008572445222601 +0.6457572788274687 +0.7720173057988251 +0.6075878977137148 +0.5051855519897587 +0.7219554450874545 +0.16318828506146982 +0.6304132748862021 +-0.32317679982761666 +0.794963359152833 +0.8259916129167498 +0.7739565605232633 +0.5965784577548708 +0.6024294801467094 +0.6658964391407587 +0.9395689139902808 +0.7546270390574261 +0.9481161791882419 +-1.5004013802977918 +0.16406094968746698 +0.636827010821431 +0.6967645262170029 +0.8860454835957879 +0.7370040617490945 +-0.45134699270254475 +0.4495871190401171 +0.5860427777252072 +0.9043714407417285 +0.8194175394009042 +-0.46777732835534686 +0.6663085307697018 +0.9104800931237086 +0.738308210551279 +0.6934774894590803 +-0.1324850346368029 +0.907862099245717 +0.6163339365209309 +0.6088435651477886 +0.5899212871740833 +0.9241518389309975 +0.8628810859123743 +0.6641365654783311 +0.7317292888986225 +0.9351765020394284 +0.5246362768758733 +0.7287592233254094 +0.11248162215422344 +0.918334074757683 +0.7474033152088939 +0.9247336153483289 +0.8645246042913355 +0.4635546011928829 +0.5922047596121093 +0.7368198325502728 +0.9415954351773185 +0.7429284849322529 +0.7082109772279991 +0.6348101859080153 +0.7602848147159743 +0.8123731966143828 +0.941314243242275 +1.017667549880216 +0.9250778330619167 +0.8550755856465109 +-0.46919298430418677 +-0.9896016858807848 +0.028759147563417676 +0.7466615502767964 
+0.8552113334772214 +-0.6680029545851797 +-0.23882891558817962 +0.5444893971173087 +0.7357435461782097 +0.8801792380543625 +0.6667594074931338 +0.5853591904348426 +0.895353906273091 +1.0888478945407178 +0.9273516092263204 +0.6496600289604004 +1.0914125589137873 +-0.6945101425998436 +0.6430762591709329 +0.7408728749243484 +0.6849350723979302 +0.7364319816053851 +0.5304346485019433 +0.8830735757305864 +0.9116048608638827 +0.6950143488281976 +0.6987134772150633 +0.6667933444508114 +0.6105113242108053 +0.7493571143437653 +0.8212210462946318 +0.8133864572079018 +1.110203937193593 +0.9345898774852859 +0.3352777493081107 +0.9668590760999365 +0.701346015503488 +0.6590072367321922 +0.7220524078236762 +0.08408220634218903 +0.7192938179781632 +0.6617464340304611 +0.938114472946952 +0.6707106389941764 +0.657426744131775 +0.7350648070246563 +0.9189158511750144 +0.6044219189319653 +0.597275910716515 +0.8997172294030767 +0.526100414192824 +0.6183362170239133 +0.579027523759552 +0.508763476956347 +0.8928377232681326 +0.7442277855976264 +0.8164553278093252 +0.4080164402800768 +0.5666502304808257 +-0.2996148549256933 +0.6313266929502335 +0.5493136326139254 +0.8037648448926019 +0.9029169996983999 +0.693361134175614 +0.6512356734240063 +0.9061167699937228 +0.882845713300465 +0.6174344635770496 +0.8556670583374645 +0.9482955602502524 +0.711512558396355 +0.7190320185903638 +0.6297833331992799 +0.5246750619703621 +0.9187994958915482 +0.6501060575470212 +0.1346812406122291 +0.5339204588691209 +1.0477793276139291 +0.7477252314931508 +0.7582825342129917 +0.8006455536683431 +0.5529687883999146 +0.8161838321479037 +0.6014598527844904 +0.5919478083611212 +0.5430010191163025 +0.9226973978876689 +-0.5916666164260777 +1.0055520559892885 +0.7334852840516015 +0.9001778024001309 +0.4132067004096575 +0.6144722519854703 +0.23277359271112152 +0.744964702392913 +0.2981022362406315 +0.5770894325879485 +0.6006681520432384 +0.8869181482217852 +0.8030502295266465 +0.7213445798492566 +0.6955137069197403 +0.6292736136697443 +0.9061167699937228 +0.6296954015723097 +0.607646075355448 +0.5396994379479465 +0.7356465834419876 +0.6300396192858975 +0.22963200005753173 +0.9029169996983999 +0.5707226654021458 +-0.6358816241432673 +0.6992904054955835 +0.7457861222128168 +0.6760591035241769 +0.6370451769779303 +0.8749917316664906 +0.30668004470050253 +1.0172360657040282 +0.7217081901100886 +0.5566109932184138 +0.885235844748335 +0.7632033930762536 +-0.5068872480104532 +1.1868238913561442 +-0.5486151615435509 +0.8310676121579665 +0.5367857077244782 +0.358679705695268 +0.1387536755335492 +0.9202539369348769 +-0.5581853836086531 +0.7240013588217368 +0.7313608305009793 +0.8539653623167699 +0.664083235973409 +0.6595114429605461 +0.7196574282389953 +0.8198781123979586 +0.4529129408925285 +0.6775077268033323 +0.8779006137531478 +0.6111852152275475 +0.11926416555294585 +0.4121885916793275 +-0.06670066624704996 +-0.3443486132816701 +0.7718670135576812 +0.7609490094590944 +0.7426375967235872 +0.6785403799440954 +0.6093283788288981 +0.5657921102652617 +0.47284306156990347 +0.5732097595862377 +0.5283984310412833 +0.7939040739884108 +0.9306047090265656 +0.7977996973602302 +0.5867215168787605 +0.7972271324028398 +0.8310676121579665 +0.6926823950220605 +0.5851167835942879 +0.6624348694576366 +0.7472918080622387 +0.39501164295761665 +0.6020407115872105 +0.7377236222145972 +0.681543800698569 +0.7103490055616922 +0.9410233550336092 +-0.19489509980603345 +0.644351319152251 +0.5265076576849561 +0.6854004935317953 +-0.22795454472089274 +-0.4569853758138486 
+0.5135679805361426 +1.0806448470563446 +0.40336983081994504 +0.8666480882145954 +0.8152675342906068 +0.6086540613700419 +0.8847461829304144 +0.807088291157976 +-0.36710091933614064 +0.8649997216988229 +0.4881007178674587 +0.9085796234937591 +0.8193108803910603 +0.4047806696600494 +-0.6158006414717103 +0.9463369129785699 +0.7816166166847939 +1.0786134777324956 +0.8340055830654903 +-0.6672975506791653 +0.9264837927371343 +0.9401506904076121 +0.7495655842266424 +0.6790591305828827 +0.7849036534427166 +0.7941901140602654 +0.17453292519943295 +0.9367327539557899 +0.5441473920933616 +0.3502584076968148 +1.0355474784395358 +0.6862052842424373 +-0.4556611072438979 +0.8104145493427003 +0.9232791743050003 +0.14166255762020644 +0.7216936456996552 +0.8121986636891835 +0.9186249629663489 +0.7687835985458246 +1.0837912878467455 +0.883097816414642 +-0.06942047099807445 +0.5012585611727715 +0.7289222176849984 +0.7782859466955714 +0.7120187038794332 +0.6994455458735386 +0.9083760017476932 +0.915129456325549 +0.14777121000218657 +0.655744440658325 +0.7566002307395419 +0.9238609507223318 +0.8880089790042817 +0.21332529189341246 +-0.6137741202846725 +0.5229151883079344 +0.5441419379394491 +0.548164284820119 +0.7780241473077723 +0.6312565016254823 +0.9119345341670372 +0.07039494649710462 +0.6554632487232814 +0.603932402558149 +1.3041972835527629 +0.7839825074486084 +0.9030382031186772 +0.9004977794296632 +0.8011843755935284 +0.7051518029001979 +-0.5856161416858306 +0.8900306520545084 +0.7666988997170534 +0.7084049027004429 +0.7080655331236663 +0.3124915062959625 +0.3561829152375539 +0.7059856824317063 +0.824898842879529 +0.9394428624331924 +0.4157422759618604 +0.7377603710916252 +0.6818565055228848 +0.7236813817922044 +0.8228985016312709 +0.7669897879257193 +0.7828430013725286 +0.9102861676512647 +0.4206505296694134 +0.834897640238732 +0.3280719323437786 +0.7825280664052799 +0.8898124858980091 +0.6141910600504267 +0.8734401339614675 +0.9285151620609834 +0.8542320098413801 +0.7068971321521922 +0.6337678364936298 +0.8727131073652756 +0.35459330813993195 +0.9034987761157313 +0.32096604944175716 +0.7653268769995135 +0.6957803544443506 +0.8961926339414106 +-0.3383029381968661 +0.5153957281139255 +-0.3684477317422629 +0.7134033317526822 +0.8962411153095214 +0.7772532935548081 +0.7465354987197078 +0.879645943005142 +0.8203435335318235 +0.4524959991875196 +0.10501064332832549 +0.661945207639716 +-0.5561782549688596 +0.8835874782325626 +0.1490850550779934 +0.7351326809400116 +0.4130612563053247 +-0.2119799339283335 +0.5412072084961973 +0.2731644870744554 +-0.7897469421170009 +0.5296977317066568 +0.7426763818180759 +0.19693131726669352 +0.9419978305326395 +0.6753648503328279 +0.8207250818988568 +0.7074401234750348 +0.8051450609613526 +0.6821328493211172 +0.5446251449480195 +0.9447612685149639 +0.6996990064660228 +0.6140248659205424 +0.8588377398119208 +0.7102859797831478 +0.8668323174134169 +1.0986183446561184 +0.7103490055616922 +0.9048707988332713 +0.5848598323432999 +1.0661052847598698 +0.528786281986171 +-0.39849745332479414 +0.9439322371202665 +0.6213032767523038 +0.6479389403924616 +0.8459756328520849 +0.5654575888252962 +0.5946191317440348 +-1.1748974748008494 +0.6702936992284223 +0.794997474283417 +0.5717192484050345 +0.7073383126020019 +0.5212828206436386 +0.7639548542819734 +1.0619261908287054 +0.6996976489877156 +0.2879793265790644 +0.6138807792945167 +0.7473548338407832 +0.6763538702422914 +0.6118881950651563 +0.3195067602616175 +0.6676950978976751 +0.898919856341756 +0.6859628774018824 +0.6834757832177906 
+0.6734062030611455 +0.7086618539514309 +0.6639523362795094 +0.421517103096321 +0.7485862605908012 +0.656956474861099 +0.7138736010233585 +0.9139707516276973 +0.6825740297709267 +0.7673630944601736 +0.5482143042111519 +0.15321033469055445 +0.9942918703945747 +0.1105375192929742 +0.7349387554675679 +0.45132275201848926 +0.7940857821560906 +0.8486663487822428 +0.1768018532270256 +0.6159460855760432 +0.6013628900482684 +0.712234445967527 +0.6856913817404611 +0.5716098744385761 +0.19761054123392796 +0.8586825994339657 +0.7105186903500805 +0.8491026810952413 +0.7609891403963619 +0.766781318042842 +0.9465502309982581 +0.5978140539025466 +0.7795803992241338 +0.20924703920791904 +0.5766955311683207 +0.6622529673644844 +0.67918033400316 +0.9216162633787945 +0.560919732770111 +-0.6252060268852355 +0.027547113360643832 +0.7272220730680835 +0.7926729051592709 +-0.14716422327343742 +0.7673630944601736 +0.6746376298111636 +0.945386678163595 +1.0308302413223396 +0.9090256520803799 +0.9274728126465978 +0.8992114329858489 +0.7026841012633503 +0.8515122050903556 +0.5486203044470801 +0.37728200663944095 +0.9014577105182601 +0.817104978142012 +0.7966089949594252 +0.6595453799182238 +0.8122810335336039 +0.19935053753543006 +-0.9587675357622183 +0.9182322638846501 +0.8652566729498111 +0.6551093347360715 +0.24066709829563476 +0.3417936451822229 +0.5535214759963794 +0.6510950774564846 +0.7797840209701997 +0.5875940651494741 +0.6731153148524799 +0.6459996856680233 +-0.5852961646562983 +0.6597926348955896 +0.9292326863090254 +-0.6270434707366406 +0.46876634826481034 +0.9385750459440061 +0.6405115947978633 +0.7206658406957029 +0.7121719050026639 +0.7184938754043324 +0.7420315796222002 +0.7414401069312467 +0.8157184110140387 +0.4526511415047293 +0.6493400519308682 +0.9475295546340993 +0.5561297736007488 +0.896265355993577 +0.8013954149889153 +0.6496697252340227 +0.7058256939169401 +0.5718280405950755 +0.9458569474342714 +0.4683154715413784 +0.7033676885537148 +0.7874925584998415 +-0.055128163678965336 +0.9805841514121474 +0.635813750227912 +0.027925268031909273 +0.7109259338422125 +0.6128869112482421 +0.8038224407579178 +0.8159917489674483 +0.624929683087003 +0.6516815080851548 +0.7443635334283372 +0.5577975326637655 +0.7347496781319351 +0.7720173057988251 +0.5629753427780153 +0.5791099420853406 +0.5841810931897465 +0.6519822864929151 +0.6863458802099591 +0.6190101080406556 +0.9016855729483816 +-0.6860210550436155 +-0.5966058982092216 +0.8037241205433887 +-0.616392114162664 +-0.5157447939643244 +0.6984128927327753 +0.6799754284401798 +0.668756839859305 +0.6259187029964666 +0.14718943358485512 +0.6695422380227025 +-0.5963208277647293 +0.4596033696918401 +0.9005898940290741 +0.7657971462701898 +0.4607717706633141 +0.9049435208854376 +0.6638020440383656 +0.49368577147384046 +0.7389433164735326 +0.7156868041907081 +0.9186249629663489 +0.6725335384351483 +1.0970121569306026 +0.5417754023734387 +0.3723369070921236 +0.7950411075147168 +0.6830491471784141 +0.6371275953037189 +0.2930407814098479 +0.6968517926796026 +0.8077577703702201 +0.6470808201768977 +0.5456965831832715 +0.47388598073732696 +0.8016316162143519 +1.0789092140779724 +0.7247985864389532 +0.7720173057988251 +0.6873591408034778 +-0.6592981249408579 +0.6972978212662235 +0.7583601044019695 +0.22860388865133305 +0.9093601735203455 +0.7508825321100084 +1.044870445527272 +0.708560043078398 +0.6845035882217427 +0.6858707628024716 +0.6807074970986551 +0.8087807272373612 +0.6087805393692443 +0.7047901318940902 +0.713907537981036 +0.9060440479415562 +0.23470799929874858 
+-0.41815179995697477 +0.5987351998966548 +-0.7039494649710463 +0.6662212643071022 +0.7056850979494182 +0.618481661128246 +0.8439539598018581 +0.6980153455142655 +0.6792869930130041 +-0.024555812948197996 +0.7323692429576872 +0.42556944927795065 +0.8755298748525222 +0.9168796337143544 +0.9061167699937228 +0.8975064790172174 +0.9791297103688189 +0.2263110263419314 +0.3743731245527837 +0.664732886306096 +0.5737527509090803 +0.7939454285954093 +0.5379395642855189 +0.5086762104937473 +0.6073745796940266 +0.5556013266883393 +-0.4104384142905221 +0.991187026618013 +0.7091612120429737 +0.7243164877144578 +0.5579332804944761 +0.6824091931193497 +0.6414424370655938 +0.9421384265001612 +0.6904231632680902 +0.7255963958325871 +0.7492504553339212 +0.8970992355250854 +0.7101453838156261 +0.5396606528534578 +0.0853466004225227 +0.7035228289316697 +0.817104978142012 +0.9096074284977115 +0.48432886742842646 +0.9916882095482517 +-0.7957731761731924 +0.9287672651751603 +0.5663418404982719 +0.3351691510435421 +0.4588616047597426 +0.7283840744989669 +0.6893905101273269 +0.8476482400519129 +0.6739346499735549 +0.6782252510513743 +0.7844358082404458 +0.9226973978876689 +0.764262226155797 +0.6660855164763914 +0.861901762276533 +0.6756799792255492 +0.7602023963901855 +0.2517152632320711 +0.6100749918978068 +0.6237079526106069 +0.8867872485278855 +0.5766567848589265 +0.7066207883539598 +-0.46539204504428794 +0.6805669011311334 +0.5351034042510281 +0.6958821653173836 +0.8932643593075089 +0.9345947256220969 +0.772982085024233 +0.6943840910427551 +0.25947228212982365 +0.7520235895898679 +0.6926581543380051 +0.7182611648373997 +0.27459846898044116 +-0.10501064332832549 +0.7269296334556382 +0.663263900852334 +0.5580399395043203 +0.13962634015954636 +0.5807583086011131 +0.90937956606759 +0.5152599802832148 +0.7702708362650528 +1.0090911958613882 +0.6853520121636844 +0.9767153382368933 +0.7657971462701898 +0.921533845053006 +0.47039047409652723 +0.8764461727098193 +0.32440337844082384 +0.71786361761889 +-0.5404412028800442 +0.8892404057542997 +0.5779071038284701 +0.8086771225537083 +0.6990916318863287 +0.9186249629663489 +0.8616690517096005 +0.6220014084531014 +0.3607304675663613 +0.11839150092694868 +0.7637027511677965 +0.8648106443631901 +-0.3158318725588072 +0.9145525280450286 +0.8089601082993718 +0.6528258622980457 +1.1111493238717567 +0.6341265986176509 +0.4971812781146403 +0.8222197624777176 +0.671757836545373 +0.5035468817476085 +0.6462275480981449 +0.951485634271953 +0.8628083638602078 +0.6969051221845246 +1.0506445764692864 +0.6155339939471002 +0.9446109762738198 +0.7391178493987322 +0.6243220993063271 +0.6515895874112164 +0.9195751977813234 +0.6081133387813015 +1.1316133093513903 +0.003781546712654381 +0.18529565511148868 +0.6672733099951098 +0.8035495391368213 +0.7571820071568732 +0.9274970533306532 +0.5914048170382785 +0.8128967953899813 +0.7299209338680841 +0.7010017977899002 +0.7005169841087906 +0.6228469235129563 +0.7730305663923441 +-0.18704111817205898 +0.6450562382445842 +0.7532113831085863 +0.8037726019114997 +0.908729915734903 +0.6791997265504044 +0.7294458164605967 +0.7004442620566244 +0.7311232717972356 +0.726915089045205 +0.2968426902971089 +0.7056947942230405 +0.4002621751240329 +0.8030502295266465 +0.6777258929598315 +0.7307499652627815 +0.1943474213045218 +0.7441453672718379 +0.7711446411728278 +0.8389215937919411 +0.8142919922014782 +0.6680393156112628 +0.5724970834750066 +0.9050792687161484 +0.8843001543437936 +0.36875316436136196 +0.21664384154060726 +-0.45540876172288036 +0.29350620254371307 
+0.7483777907079241 +0.9660353776557313 +0.6841787630553994 +0.6895408023684707 +0.7070765132142027 +0.5522706566991168 +0.6191216151873107 +0.94593936576006 +0.665663728573826 +0.6156697417778108 +0.702000513972986 +0.7743298670577176 +0.7606726656608619 +0.5094955456148225 +0.3979350694547072 +0.4648393574478231 +0.6984807666481306 +0.2777982392757641 +0.6540718334584972 +-0.6115973068564907 +-0.49305551368839806 +1.014618071826037 +0.6832964021557799 +0.6112553367391286 +0.9383859686083735 +0.8843001543437936 +0.4833514830473096 +0.6848817428930082 +0.6127996447856424 +-0.8165232017246805 +0.9392586332343706 +0.9042356929110178 +0.7574147177238059 +0.9099225573904326 +0.7224063218108863 +0.12886347643891466 +0.4420434181620527 +0.6210754143221822 +0.12039378142993104 +0.8173958663506776 +0.7242146768414249 +0.9162833128865897 +0.8704490274744942 +0.7606726656608619 +0.7513642429835587 +0.7210294509565351 +0.7234874563197605 +0.39104084437640435 +0.8024490605620708 +0.9075712110370514 +0.5341311395023838 +0.7261345390186187 +0.7330528302480517 +0.7098690400173937 +-0.4647811798060899 +0.4870341277690176 +0.606114064123142 +0.720263445340382 +0.8182685309766747 +0.35361825086448445 +0.8860454835957879 +0.6733577216930345 +0.6983110818597423 +0.48171087355043496 +0.6484140577999489 +0.7301681888454499 +0.7695835411196553 +0.5993800020925304 +-0.612319679241344 +-0.5538511492995339 +0.9331693733996349 +0.6944131798636217 +0.6410739786679506 +-0.5720559030251969 +1.0961976699463385 +0.600684150894715 +0.6681265820738627 +0.7110713779465454 +0.34330626386728463 +0.40525575603946107 +-1.4646221306319083 +0.73258886355523 +0.5240205635008641 +0.34528430368621155 +0.9171705219230202 +0.9363691436949578 +0.7092339340951402 +0.3472622882363788 +0.7033676885537148 +0.41420851940030223 +0.5338865219114431 +0.43725588887886024 +0.9270119487613352 +0.20507618710933373 +0.8961683932573551 +0.09046623289503941 +0.8252110628901633 +0.6317316190329697 +0.5453912533446729 +0.7623695135447451 +0.8590898429260978 +0.8004627789105648 +0.9265322741052453 +0.9431129019991914 +0.7089042607919858 +0.49079628193442765 +0.6273973847238504 +-0.33597588100890846 +0.9738937226128358 +0.7915068157794285 +0.715226231193654 +-0.5867215168787605 +0.7615647228341034 +0.1860617945362177 +0.7913626690365062 +0.8615042150580232 +0.9247481597587622 +0.21048670779051615 +0.5422374451316484 +0.6969681479630688 +-0.5232642541583333 +0.665552221427171 +0.6500527280420992 +0.5602492354491365 +0.7312202345334577 +-0.29015129187043515 +0.26890985917177435 +0.6579842798650511 +0.7822420263334251 +0.8919359698212689 +0.7714597700655492 +0.9064076582023883 +0.9221156214703375 +0.5757550314120627 +0.6904910371834454 +0.877609725544482 +0.8096436955897361 +0.6368657959159196 +1.1004300933824247 +0.5639982996451565 +0.3520329101272563 +0.5768167733736926 +-0.24646957920246593 +0.9066985464110543 +0.16968478838833762 +0.5587332230683069 +0.9285151620609834 +-0.5890728632321417 +0.7991086942992259 +0.4057890510886816 +1.2010434766230866 +0.4731238536306228 +1.0353331907924852 +0.5890534706848972 +0.6271598260201067 +0.7718718616944923 +0.8866272600131193 +0.7030089264296937 +-0.10940063121077234 +0.6775513600346321 +0.9250245035569946 +0.8459029107999184 +0.808669220090706 +0.49637256429169085 +0.8034875314670072 +0.7675764124798617 +0.8969004619158304 +0.8892210132070553 +0.5838902049810808 +0.7556693884718115 +0.16406094968746698 +0.6498442581592221 +0.7362380561329414 +0.3565901587296859 diff --git a/data/samples/poi_lon.csv 
b/data/samples/poi_lon.csv new file mode 100644 index 0000000..f3a40c3 --- /dev/null +++ b/data/samples/poi_lon.csv @@ -0,0 +1,1024 @@ +0.30717794835100204 +-2.1368744771320127 +-1.7110044433717744 +0.31212304789831924 +-1.449825617084445 +-1.707031492217818 +-1.423073598160821 +0.8685727985286003 +1.977797412086352 +-1.4963822748813942 +0.30689190827914736 +0.09808265582527022 +0.23908872723988622 +-1.191235695854241 +-1.9495133819304218 +0.3449352378358126 +2.6305699448794755 +-1.3602999227307582 +0.15329808596683528 +1.20136830178943 +0.856302164259718 +-1.063836356732277 +0.3825179943954239 +1.3220675158384603 +-0.8961393044364884 +-1.241071051613797 +-0.6512308252871952 +0.3246312408709453 +0.39473529915938416 +0.48549242026308936 +-1.5989106721624384 +0.3235937395933709 +0.12749145372137466 +0.28000932903120046 +-1.5332281146457187 +-1.9950975034830647 +0.2885465585859625 +0.29361770969036827 +0.36607796246899943 +-1.486618127343848 +0.2882847591981633 +0.8689848901575434 +0.7882149308846946 +0.26267205242514663 +2.0245993856059425 +-1.077159036689167 +-2.0891445702267792 +0.1372992344902206 +-1.7107184032999199 +0.26790804018112957 +0.2944467410850656 +-1.7501143630268805 +2.9132454097872014 +-1.5450333277807358 +1.5978489302008085 +-2.074352962993769 +0.3420845333908886 +0.39697029022929914 +0.9585784584265856 +0.3673918075448063 +0.4086979331753388 +-1.3297227238631797 +0.3982259576633729 +2.4931495069689777 +-2.079079838206945 +-1.4525696625195252 +-1.351427832366454 +-2.1460956333467163 +0.3235743470461265 +0.10103032300641619 +0.3554653909895118 +2.4241471392894924 +2.668317538090664 +0.9962387851751744 +-1.5957109018671158 +0.38600865289941255 +0.2531212229072887 +0.33830298667823416 +-1.6088590488988062 +0.27745310041618226 +0.9491779211498718 +-1.4264624457917767 +-0.8157523479717164 +1.0649174912411512 +-0.11373728958829712 +0.20274908144000794 +0.47917529799823205 +0.1850049007113989 +1.3091876685764052 +-1.2282900055014425 +-1.3177914591710742 +1.0630848955265573 +0.2671590030438154 +0.8826323952807769 +1.2790354535031778 +-1.566452396212155 +1.1932525207676563 +0.10770426814057008 +0.8682334289518236 +-1.6145992428831433 +1.6284228470736914 +-1.2471007763284927 +2.449369904601028 +0.26238601235329195 +-1.6486549799126824 +0.8610484901977802 +0.35669196960271893 +0.09541133244235668 +0.3964806284113785 +-0.1646427261047984 +-0.1660002044119051 +-0.0028381332261729016 +1.3521600710975685 +-1.517277744537215 +2.492858618760312 +-1.6330124664916832 +0.7812433101503395 +-1.7189311470579154 +3.01941960595019 +-1.2883438761804809 +0.8188503073940063 +-1.6566762222666398 +-1.2463099482518667 +-1.2877281628054718 +0.6253563191263793 +0.906848838652198 +-1.4820220936469297 +1.2813541786832119 +-1.2743860903013373 +0.5491969379608823 +1.0245519041519713 +-0.8922947319452899 +0.312413936106985 +-1.2979189463823944 +0.6605151527245473 +-1.8803498621833354 +-1.2217304763960306 +0.2815797859884185 +0.14340788687220074 +0.2690715930157924 +-1.2656545959045546 +-1.4395572633185452 +0.1198459419702773 +-1.3156679752478142 +-1.639188992789019 +0.29800042536759846 +-1.837579599235852 +-0.09697485656393492 +-1.2893648937928976 +0.8703956979695722 +-1.6170233112886911 +0.17113922943166623 +0.8203144447109572 +0.5343228542244418 +0.5302892043976104 +0.8511388985559013 +0.8135512938594791 +1.4007558790900971 +0.3714442196610966 +0.8627841231761524 +-1.3754164133077535 +-1.627291665054591 +0.8351109582584202 +-1.4144972441419932 +-1.616756663764081 +0.81426396997071 +0.255167136641571 
+0.10122909661567112 +0.2911790968743873 +0.22183134792847928 +-1.9803687106498336 +0.2394009957318889 +-1.3883221534988894 +0.25575860933252464 +-1.3620597963931858 +0.9787030743294426 +0.40149845001086226 +0.7787077345981367 +0.2670353755551324 +0.8279841971461099 +0.4981557536136704 +0.38280888260408963 +0.8067542060503233 +2.514437675706497 +0.7925637096042473 +0.9772243926020584 +1.9989779521866653 +1.0347863209601935 +0.9164869346326556 +-2.1007655541629746 +0.371134569162972 +-1.776507619826484 +0.7792895110154681 +0.3087148077201193 +-0.0026878070480712674 +2.657123190193845 +0.11926416555294585 +-0.05214655954014168 +0.21731967181207393 +-0.059341194567807204 +0.10856907878493326 +0.8519873224978429 +-1.2416660149632548 +-1.525315955370011 +0.6197593875848102 +-1.5835711672921329 +2.417106481086947 +2.6520277984053835 +0.08523024513905642 +0.9854128956759985 +0.324616696460512 +1.218239817892042 +1.3620549482563749 +0.0107628637206317 +0.24143721319254893 +0.33452143996557987 +0.8494275062615848 +-1.8828960066737863 +0.968153528628499 +0.20364598675006057 +0.4926191813753995 +-1.3409510087176766 +1.0061968581851644 +0.13222323524900373 +2.530078152910036 +1.3812004422058817 +0.028759147563417676 +0.3423754215995543 +0.9391083409932266 +-2.0819063019678135 +-0.995506716516699 +0.24312436480281008 +1.0388345151974583 +-1.5437388752521732 +1.0028661881959418 +-1.5308816164291483 +-0.021759747005134968 +-1.425260107862625 +-1.377307186664081 +0.8393918630626175 +-1.4078989299420925 +-1.5061221817348844 +0.8487730077920868 +0.3203115509722594 +1.7948093362883688 +-0.7897857272114895 +0.37816921567587136 +-1.4257109845860572 +-1.382252286211398 +-1.3288455989513164 +-1.539234956154666 +-1.3447272224798388 +-1.5978974115689195 +0.6402449472732532 +-2.1213022616947748 +0.24580053632253474 +2.6796912670494937 +-1.5856510179840926 +0.18122335399874454 +0.8470664636345812 +-1.4342291609631515 +1.3248745870520844 +-1.6615098146673017 +0.28099800957108706 +2.4326980890714296 +0.30139412113536523 +-0.9676638668105783 +1.3933545195088064 +1.0377000511836618 +-1.6649519918031794 +-1.2857646673969783 +-1.3512726919884988 +0.8605151951485597 +-1.3213257509063627 +0.35449576362729274 +-1.2269664641520135 +-1.452109041041103 +0.8049410028829738 +-1.404306460565071 +-1.3652886555093753 +0.18355045966807032 +0.866778987908495 +0.2546623001554316 +1.712458884415103 +0.18965911205005048 +2.111535690088823 +1.3840460968315034 +0.9270122396495436 +1.3505939528349455 +-1.410468442451973 +-0.9398404096517022 +0.5474516087088881 +1.2062212867373365 +1.1337658820955163 +-1.4331625708647102 +-2.1402798084281263 +0.31095949506365633 +0.39638366567515665 +0.8107054375513661 +0.2422080669455131 +-2.1209338032971314 +0.8429940287132611 +-1.318494439008683 +-0.01963010594812511 +-1.3492267782542164 +0.13817189911621777 +-2.05263325190242 +-1.496164302650367 +0.4729842272904633 +-1.9748545930420174 +-1.7219321437439832 +-1.4370691995070908 +0.14166255762020644 +0.457061588524519 +-1.5613182193292052 +0.4319689898685966 +2.6418273185548387 +0.31328660073298215 +0.2807071213624213 +0.25271882755196784 +-1.3213596878640406 +0.8541980728837024 +1.2812462038890375 +1.3889906146024031 +-1.5369455689897304 +-1.5756541598796143 +0.9730065135764053 +-2.106777243808733 +-1.5293350607864087 +0.68126018469512 +0.25365451795650923 +-1.6086651234263627 +0.40404372183668724 +0.23096523768058294 +0.27954356852775847 +-1.6550424001613004 +0.3181832189121884 +-1.5157748221257752 +1.5288341435340675 +1.3433178691088534 
+0.3628636477632433 +0.8228839572208375 +-1.2498762376901085 +-1.4734418611186526 +0.6821328493211172 +0.27306776674507405 +-1.3445824086332914 +0.8363035999139495 +0.1034495432751528 +-1.6497142978059065 +0.3029940062830267 +2.4524736391238875 +0.6200379130446078 +0.28875502846883966 +-1.2727231793751317 +1.3408976792127545 +-2.109273135421882 +-1.3706797836433133 +0.835920597105873 +-2.0631876457401743 +0.08234560373645469 +-1.1772245804701753 +-1.4748080660720195 +1.3498473824296406 +-0.07621271067041906 +1.3858302111779863 +-1.6446528429751233 +0.616392114162664 +-1.2577375884920357 +0.8422813526020302 +0.9939068313690375 +1.4009400462327672 +-0.9135392674515097 +-1.221439588187365 +0.9517522817965633 +0.3255620831386756 +0.6257538663448893 +-1.3453822057630176 +0.8618726734556664 +1.6332315712474692 +0.2418223006994542 +0.11606439525762292 +0.1719634126895524 +-1.8889211660919725 +0.37643358269749927 +-2.0398268985159116 +0.319686141323628 +-2.1305767474144 +0.3067125272171369 +-2.071386084779621 +1.2530203513748401 +0.3298914693109838 +0.3143871277891008 +-1.8258471081530012 +0.34084341036724813 +0.195074480868044 +-1.4251243600319143 +-1.5215053198364903 +0.8222343068881508 +-0.1427824772235694 +0.27111605230903135 +-1.2985928373991367 +-1.6532873746356842 +0.2315470140979144 +0.7160116293570513 +0.07669718498195181 +0.3871285725027756 +-1.6062143902683534 +1.3702665304007906 +-2.1264315904409132 +1.3177211611873134 +0.3711297210261609 +-1.3282634346830402 +-1.4709489491703875 +0.011911872144861299 +1.3487981253899266 +0.6062069544244425 +0.921082968329574 +0.9907361498945811 +1.3287366128358031 +-1.4153699087679903 +0.9170250778186873 +2.463959844856735 +-1.5408396894391385 +-1.3419206360798959 +-1.5323021205147993 +0.2929244261263817 +1.3361707458219365 +0.2586250217407166 +0.920103644693733 +-0.880353770979562 +0.864655503985235 +-1.4592261543611589 +0.294669755378376 +1.5171419967065045 +-1.3790331233688307 +0.09791297103688187 +2.481276419918605 +1.3089969389957472 +-1.3260817731180472 +0.9096849986866888 +0.8828263207532208 +0.9585348251952855 +0.9450957899549295 +-1.2598368317312403 +0.3149398153855657 +-1.4348982038430829 +-2.755118579556515 +0.8460483549042512 +0.6891141663290944 +-1.3875173627882476 +0.3822368024603803 +0.8348200700497544 +1.1388564257471665 +-1.5748978505370834 +-1.3970827367165388 +1.7300576210393792 +1.3733423066882875 +-1.7023844561216468 +-1.286037617499443 +-1.243432676017218 +0.6664248860531681 +0.5809037527054459 +1.7203128660490774 +-1.4338267656078305 +-1.562069680534925 +-1.523948780789282 +0.8623187020422872 +1.3351943310681822 +1.8281305805910273 +-1.2505623460116149 +-1.446766442756644 +0.32196961376165395 +-1.3745631412290007 +-0.0033552015614866544 +-1.4795883289677594 +0.1541125729510993 +-1.3543367144531109 +0.27932166930591457 +0.19576839468981608 +0.31663666326944906 +0.41684280301797905 +0.46134869894383446 +2.5048432129573395 +-2.003119230650703 +2.5998473019075643 +0.7870562261868429 +0.30656373789840435 +1.166679882906043 +0.383390659021421 +1.4407692975213189 +0.29350620254371307 +0.8402160463205035 +-1.5637228951875082 +2.541184846493311 +-1.3977178426387922 +-1.4568554154605333 +0.3889175349860698 +2.6275592519197852 +0.16115206760080975 +-1.5637713765556194 +-1.3180871955165507 +-1.4007673206929712 +1.319817980358112 +-1.3350944594498735 +1.324389613770311 +0.8451514495941985 +-1.506510032679772 +1.3123809384898917 +0.8416510948165877 +-1.4999505035743599 +0.6357119393548789 +1.3412855301576423 +-2.111271466632618 
+-2.738251911590715 +0.3602117169275741 +0.8537181073394041 +1.7619098798882757 +1.0693050550551926 +0.8339571016973794 +0.28391173979455536 +0.31607912753617307 +0.13235413494290332 +-1.403138059593597 +0.15061706631029953 +1.2766938034234188 +0.56635934227216 +0.1375998189725085 +-1.42922103563729 +-2.107950492917018 +0.3246312408709453 +2.6695683573879263 +0.09364176250630686 +-1.2796560150149978 +-2.0943951023931953 +0.32898971586412 +-1.4306803248174296 +-1.1556881871279274 +0.32541179089753164 +2.46806602281026 +0.28721816909972236 +1.3326389557252425 +0.3558290012503439 +0.26798289541349285 +0.3545927263635146 +0.4118977034706618 +-1.4864872276499483 +0.2346401253833932 +0.8204114074471789 +-0.8458350368845631 +1.3241813034880978 +0.14631676895885795 +0.8202223301115462 +1.6568992365599504 +-1.716327697590357 +-1.2383401931108435 +0.26395196054327574 +0.006321631032091572 +-1.1970437637539328 +-1.1949687611987843 +1.3177817628974517 +-2.1650760889621545 +2.0912709630321253 +-1.6776638065218716 +-1.5036690245084705 +0.5774130942014574 +-1.2835296763270632 +-0.0038833575856873836 +0.25801784108649506 +-2.082531711616445 +-1.5634513995260872 +-1.9521992497237686 +-1.4055039503574112 +-0.26092672317315224 +0.30077840776035614 +0.2655809345118038 +-0.11073144476541802 +-1.2629057023326635 +0.9271964688483654 +0.8369629465202586 +-1.4858569698645059 +-1.3271580594901105 +0.24728406618672993 +0.26811694639631967 +2.5967590387588966 +1.5005274318548807 +0.27577656622553737 +0.4239307790358005 +-0.14434842541355328 +-1.6479883611011568 +-1.3406116391408998 +-1.948955846197146 +1.0052223826861342 +0.36099226695416053 +0.26589266970875725 +0.27721646285843266 +-1.178679021513504 +-1.1475539831862718 +2.024828217663426 +-1.2421944618756642 +-1.5047112769601196 +2.37091343355083 +-1.5040579905248244 +0.16144295580947549 +-0.11796486488757228 +-1.5937183176377554 +1.0070355858534836 +0.3489204062945331 +0.15707963267948966 +0.8207168400662779 +-1.71195952632356 +-1.652506824609098 +-0.04453013660991088 +-1.2409581869888346 +0.3502294032335288 +-1.5995942594528032 +0.6690428799311597 +-1.9176320342606585 +0.1439896632895322 +-0.0525538030322737 +-1.4084962203972196 +0.8330553482505157 +0.8213083127572315 +0.36302848441482055 +0.8435758051305926 +0.12203245167208128 +-1.0741144067717991 +0.9821791884229978 +0.5287717375757377 +-2.134494041957765 +0.41131592705333025 +0.3027419031688497 +0.02394009957318888 +-1.4087715945680896 +2.9071367574052216 +1.4136573529208392 +0.3209127199368351 +0.30703735238348023 +-1.527914556700758 +-0.2883574812503298 +-1.4994899305773064 +-1.5367672545178184 +-1.4286993761164162 +2.5605725456008805 +1.3340763507195434 +-1.4659214312972817 +0.5730594673450937 +0.6123827050198882 +0.5871820898758144 +0.14989469392544635 +0.3898047440225002 +0.3438298626428829 +0.2542072055529741 +0.09894077604083411 +0.19259223482076315 +0.06973075175398456 +1.3309008211082758 +-1.6817895709481137 +-1.5990124830354713 +0.9203508996710987 +-0.4365242992163017 +2.044347786092258 +0.2719584645613273 +0.9517124301119761 +-1.4323626282908795 +0.23353959832727458 +0.8432849169219269 +0.4235332318172906 +-1.5277933532804806 +1.676679634749219 +-1.4127616111636212 +1.4136439720632406 +-1.7819811662862104 +0.3993895104980357 +-1.7258154935726517 +0.840957811252601 +1.0675403332559539 +-1.2614948945206348 +-1.6841215247542505 +1.3482424125560917 +-1.312191861154259 +-1.6128975468624487 +-1.6497870198580733 +0.3308126153050919 +-1.4858909068221837 +-1.2465723294160833 +-1.0946656587140324 
+-1.2905497784295294 +0.8678213373228805 +-1.563150815043799 +-1.4886398003940744 +2.4157032363683437 +-1.4844267695052324 +-1.385209649666166 +-1.4386167247771928 +-1.6825366688307035 +-1.3074794721738743 +2.0203407822310764 +0.2667444873464667 +1.0506494246060976 +-1.6452782526237544 +0.3505202914421945 +-1.2466402033314388 +-1.553120019981643 +-1.2917603581912598 +-1.4822014747089403 +0.16609716714812703 +1.0505912469643643 +0.2814488862945189 +1.357572995097131 +0.6084356913978711 +-0.013962634015954637 +0.360618078252732 +-1.525315955370011 +-1.8329593248548781 +1.2980556182679812 +-1.2917618126323032 +0.4098614860100017 +-0.2591813939211579 +0.25947228212982365 +1.6715163690454025 +-1.2533403284043723 +1.1506567907453726 +-1.378679209381621 +0.843192802322516 +-0.10205812801036843 +0.33038113112890444 +0.040033392754883705 +0.6089523088564613 +-1.221439588187365 +1.7809582094190695 +2.533781741582768 +-1.3463372887148037 +-1.6505578736110373 +1.0002724350020056 +0.2911790968743873 +0.36959770979385476 +-1.5807980330361864 +0.4805240496590788 +-1.9803727423604056 +0.30535989704684124 +-1.5415087323190695 +-1.3530761988822262 +0.8208331953497443 +-1.2945591875723053 +-1.3452076728378184 +0.3209224162104573 +0.3361116288396191 +0.297011405458135 +-1.6407016114740804 +0.2634283617676775 +-1.1658072182800459 +0.7852963525244152 +0.5245831412964237 +0.11135685441404933 +0.37780075727822815 +0.10488943990804811 +-1.6934832769364758 +0.34004831593022855 +0.9468265747964905 +0.6451900468205705 +-1.4723597569824165 +0.47763843862911487 +-1.406861428664518 +0.1736146880874115 +-1.2910636809315055 +0.4307569556658227 +0.8648882145521678 +0.04328804395890825 +-1.4280962678971156 +2.389627241641658 +-1.9676357173302963 +0.8818906303486792 +0.4423246100970962 +0.12107736872029551 +0.7641293872071729 +0.815170571554385 +-1.9933987163444569 +1.0035740161703617 +-1.4941763726323454 +-1.2550953539299892 +1.0268353765899971 +1.3377948716536536 +0.7478735844795703 +0.2530824378128 +-1.25673887230895 +-1.6745071846441675 +0.838048929165944 +-1.9411697384785265 +0.874002711757027 +-1.4451907982930379 +-1.4305300325762857 +1.0100802157708517 +0.7962967749487907 +0.4655229447381875 +-1.3547197172611876 +-1.2883438761804809 +-1.748150867618387 +0.3758275655961123 +1.4824438815494947 +-0.10544697564132408 +-1.5452224051163685 +0.330158116835594 +0.2548180707911721 +0.8784484532128016 +0.3592469377021662 +-1.9499303216961759 +-1.409872121624208 +-1.5513262093615376 +-1.705530896912048 +-1.4779254180415538 +0.20245819323134223 +-1.7423088627610168 +-1.3161237001080577 +1.310516073576683 +0.87379424187415 +-1.7050654757781827 +1.335775194096538 +0.8491026810952413 +-2.36533322808126 +0.26412649346847517 +-1.4951653925418091 +0.2484185302005262 +-1.3980087308474578 +-1.41952961015191 +0.3193855568413402 +-1.3285155377972169 +0.31648637102830507 +-2.602639828710755 +0.2584347797662666 +-1.295305800641214 +0.2151409191291677 +-1.5075960153254573 +0.899271200816456 +-1.4593667503286807 +-1.517059578380716 +2.6720990848033184 +-1.3428708708948702 +-1.5877454130864859 +0.333159113521662 +-1.5196727241218957 +0.7483099167925689 +-1.7054727192703145 +0.3821543841345917 +-1.491092957620489 +1.037598240310629 +-1.2234661093744028 +0.44428325736877883 +1.204757149420386 +0.38862664677740405 +0.4174245794353105 +1.059705744169224 +-1.671235177110359 +0.5858488522527634 +0.9259601939615361 +-0.21496638620396824 +-0.031549686630516056 +2.0943951023931953 +0.27432697331901984 +1.0465236601798555 +-1.4387185356502257 
+-1.3261399507597802 +-1.2911509473941052 +-1.5156972519367977 +1.813591018294552 +0.1605702911834783 +-1.3205064157852875 +0.24684288573692023 +0.9290242164261484 +0.4943014848488496 +-1.6258420721480733 +0.14934685446579257 +-1.8135085999687637 +0.32977511402751747 +-1.232289718370596 +-1.334929622798296 +-1.3587822135020446 +0.9863388898069176 +0.890777265123417 +0.7938969472272985 +-1.9642468696993405 +-1.512046604918043 +-1.7385903418269069 +0.17075137848677857 +1.0443856318461624 +-1.0832919297552028 +0.032307983709139475 +0.8289780651923846 +0.2953969759000403 +0.32259502341028523 +0.1970777309983886 +0.8408560003795681 +0.0061959188445798695 +-1.4369392694405536 +-0.11606439525762292 +0.5864306286700948 +0.15580942083498267 +-1.472141590825917 +-2.1334565406801906 +0.9965005845629736 +1.6449291867733558 +0.8288423173616738 +-1.2554735086012543 +0.11484266478122689 +-1.5885453556603168 +0.44397685512231755 +2.3772361796543886 +0.1900130260372604 +0.5550147021341968 +-1.416625576202064 +1.498074274628466 +0.3517565663290238 +-1.5728325442555564 +0.301941960595019 +-1.5955412170787273 +-1.5850934822508167 +-1.3551124163428863 +-1.5916966445875287 +0.32812189937493397 +0.1852085224574649 +0.3909537524467298 +0.3720460188834579 +0.21234354418916565 +-1.381592939605089 +1.7526014572109725 +-1.2674387102510376 +0.28299170887191377 +0.32172235878428806 +-1.5795908469702236 +0.8833741602128744 +1.2863600185973807 +0.8109866294864095 +-0.3994953938059901 +-2.072316687355467 +0.4107341506359989 +0.40840704496667307 +-1.491868659510264 +-1.3353174737431839 +1.5965108444409462 +0.1989675347273536 +1.033074928665877 +0.2723440853632818 +0.37117335425746073 +-0.050352748920036404 +0.39066286423806407 +-1.238820158655142 +0.9263480449064238 +0.27801310869923185 +-0.761894396137258 +0.3611861924266044 +0.8252207591637856 +-1.2604573932430603 +-1.5406603083771275 +2.768964858289004 +0.40375283362802156 +1.6583536776032788 +-1.3121094428284705 +-1.308778772839248 +0.3566240956873636 +2.658249373589968 +-1.253117314111062 +-1.7578374449669556 +0.15038920388017807 +0.6994503940103498 +-1.2854504402577038 +-1.7170112848807213 +0.8454714266237308 +0.1817891315645994 +-1.7260482118966034 +0.2587935429762703 +1.0460388464987458 +-1.5470792415150183 +0.8148360501144192 +-1.578553345692649 +-1.46790431925302 +-1.8306128266383082 +0.3723369070921236 +-1.3381536337776747 +-1.4127470667531878 +0.0756309342530876 +1.3415375557016302 +-1.6987483535133252 +0.8136385603220788 +0.8023569459626598 +0.3577924966588375 +-1.669698317741242 +0.29602723368548267 +2.1317048118876056 +-1.5424638152708554 +-1.2849058684422607 +0.3010692959690218 +0.8530926976907728 +0.6263404908990317 +1.356469894649993 +-1.7966855652342628 +1.3475493229175775 +0.2679904585069182 +0.8145451619057534 +0.2583085353696883 +0.808867993699961 +-1.476921853721657 +0.8445793694504894 +-1.6057707857501384 +2.021929031850391 +1.3154546572281263 +-1.395032653584599 +0.9014334698342046 +-0.08930268006037652 +-1.5102770349819932 +-2.0883494757897596 +-1.3739425797171807 +0.02516667818639601 +1.3592624214531837 +0.24666122605060853 +-1.6513675124584901 +0.3340123856004148 +0.8384513245212647 +-0.8456120225912527 +-0.10958264384144392 +2.4232325867614475 +-1.4675310127185652 +1.8286590275034365 +1.9643680731196178 +0.24689796057109425 diff --git a/data/samples/ssd_hdd_benchmarks_f.csv b/data/samples/ssd_hdd_benchmarks_f.csv new file mode 100644 index 0000000..a339531 --- /dev/null +++ b/data/samples/ssd_hdd_benchmarks_f.csv @@ -0,0 +1,1024 @@ +465.8 
+3600.0 +953.9 +1800.0 +1800.0 +953.9 +1900.0 +1900.0 +1800.0 +953.9 +3600.0 +953.9 +1800.0 +931.5 +931.5 +1800.0 +1800.0 +931.5 +931.5 +931.5 +953.9 +931.5 +953.9 +953.9 +953.9 +953.9 +931.5 +953.9 +931.5 +1800.0 +953.9 +1800.0 +931.5 +476.9 +3600.0 +1900.0 +1800.0 +953.9 +931.5 +953.9 +465.8 +953.9 +953.9 +931.5 +931.5 +1900.0 +476.9 +476.9 +1800.0 +1800.0 +931.5 +931.5 +465.8 +1900.0 +931.5 +476.9 +931.5 +1800.0 +931.5 +465.8 +465.8 +931.5 +953.9 +476.9 +476.9 +931.5 +953.9 +1900.0 +931.5 +1800.0 +1800.0 +1900.0 +931.5 +1800.0 +931.5 +1800.0 +1800.0 +465.8 +476.9 +1800.0 +3700.0 +1900.0 +931.5 +953.9 +465.8 +931.5 +1800.0 +1900.0 +476.9 +3600.0 +1800.0 +931.5 +1800.0 +3600.0 +931.5 +1800.0 +465.8 +931.5 +1900.0 +1900.0 +931.5 +476.9 +953.9 +1800.0 +931.5 +953.9 +465.8 +931.5 +1800.0 +1800.0 +931.5 +931.5 +476.9 +476.9 +476.9 +1800.0 +931.5 +931.5 +931.5 +1800.0 +931.5 +931.5 +465.8 +476.9 +3600.0 +1800.0 +953.9 +953.9 +931.5 +465.8 +953.9 +1900.0 +953.9 +238.5 +1900.0 +953.9 +931.5 +3500.0 +465.8 +931.5 +953.9 +1800.0 +476.9 +1800.0 +931.5 +476.9 +931.5 +931.5 +1800.0 +953.9 +953.9 +931.5 +953.9 +476.9 +1800.0 +953.9 +1900.0 +953.9 +476.9 +476.9 +3600.0 +476.9 +953.9 +465.8 +953.9 +931.5 +931.5 +931.5 +1900.0 +931.5 +1900.0 +1800.0 +476.9 +931.5 +953.9 +476.9 +931.5 +953.9 +953.9 +476.9 +1800.0 +1900.0 +931.5 +931.5 +1800.0 +476.9 +931.5 +465.8 +476.9 +894.3 +1800.0 +476.9 +465.8 +953.9 +953.9 +953.9 +953.9 +476.9 +931.5 +953.9 +1800.0 +349.3 +931.5 +1900.0 +953.9 +1800.0 +931.5 +931.5 +953.9 +476.9 +465.8 +465.8 +465.8 +953.9 +447.1 +476.9 +953.9 +931.5 +953.9 +3600.0 +476.9 +232.9 +260.8 +931.4 +1800.0 +953.9 +953.9 +476.9 +953.9 +1900.0 +931.5 +931.5 +476.9 +1800.0 +953.9 +953.9 +953.9 +465.8 +465.8 +465.8 +953.9 +953.9 +931.5 +465.8 +953.9 +953.9 +953.9 +1900.0 +447.1 +476.9 +953.9 +465.8 +465.9 +953.9 +953.9 +476.9 +465.8 +953.9 +953.9 +476.9 +1800.0 +1800.0 +953.9 +953.9 +465.8 +1900.0 +953.9 +476.9 +931.5 +953.9 +1700.0 +465.8 +931.5 +3600.0 +2900.0 +476.9 +953.9 +476.9 +1800.0 +1900.0 +1800.0 +3700.0 +1900.0 +953.9 +1900.0 +465.8 +476.9 +476.9 +953.9 +1900.0 +465.8 +953.9 +931.5 +3600.0 +953.9 +953.9 +953.9 +476.9 +476.9 +476.9 +1900.0 +953.9 +1800.0 +931.5 +476.9 +476.9 +476.9 +931.5 +1900.0 +476.9 +465.8 +931.5 +476.9 +953.9 +476.9 +465.8 +931.5 +465.8 +953.9 +476.9 +1800.0 +476.9 +931.5 +931.5 +953.9 +465.8 +1900.0 +238.5 +931.5 +476.9 +476.9 +465.8 +465.8 +953.9 +953.9 +1900.0 +476.9 +476.9 +465.8 +465.8 +953.9 +953.9 +465.8 +953.9 +476.9 +238.5 +931.5 +931.5 +476.9 +1800.0 +953.9 +447.1 +476.9 +931.5 +953.9 +354.0 +1900.0 +476.9 +953.9 +1800.0 +465.8 +476.9 +953.9 +3600.0 +1900.0 +953.9 +1900.0 +953.9 +476.9 +465.8 +238.5 +476.9 +953.9 +238.5 +465.8 +953.9 +953.9 +953.9 +476.9 +1900.0 +953.9 +476.9 +1800.0 +953.9 +476.9 +1800.0 +953.9 +476.9 +465.8 +953.9 +953.9 +476.9 +953.9 +1900.0 +953.9 +1900.0 +465.8 +1900.0 +476.9 +953.9 +476.9 +476.9 +953.9 +465.8 +1900.0 +953.9 +894.3 +476.9 +476.9 +476.9 +953.9 +1900.0 +476.9 +1800.0 +476.9 +1900.0 +1800.0 +1900.0 +953.9 +953.9 +476.9 +931.5 +476.9 +931.5 +953.9 +238.5 +953.9 +476.9 +465.8 +894.3 +476.9 +476.9 +953.9 +476.9 +476.9 +238.5 +953.9 +953.9 +476.9 +476.9 +953.9 +465.8 +3600.0 +476.9 +953.9 +238.5 +894.3 +894.3 +931.5 +476.9 +465.8 +476.9 +476.9 +953.9 +931.5 +476.9 +465.8 +953.9 +238.5 +465.8 +476.9 +953.9 +2900.0 +465.8 +953.9 +465.8 +953.9 +465.8 +476.9 +953.9 +476.9 +931.5 +953.9 +238.5 +953.9 +953.9 +953.9 +953.9 +953.9 +931.5 +476.9 +1900.0 +931.5 +953.9 +476.9 +237.5 +465.8 +953.9 +476.9 +1900.0 +953.9 +931.5 
+953.9 +953.9 +238.5 +476.9 +476.9 +476.9 +476.9 +476.9 +953.9 +232.9 +476.9 +953.9 +476.9 +931.5 +232.9 +232.9 +953.9 +953.9 +476.9 +476.9 +476.9 +953.9 +476.9 +476.9 +1900.0 +238.5 +476.9 +931.5 +476.9 +476.9 +931.5 +953.9 +476.9 +476.9 +953.9 +953.9 +447.1 +894.3 +476.9 +476.9 +238.5 +465.8 +953.9 +953.9 +476.9 +238.5 +476.9 +953.9 +476.9 +476.9 +476.9 +476.9 +476.9 +953.9 +931.5 +476.9 +476.9 +476.9 +238.5 +953.9 +476.9 +1800.0 +953.9 +1900.0 +476.9 +953.9 +238.5 +238.5 +931.5 +1800.0 +953.9 +476.9 +476.9 +953.9 +465.8 +476.9 +953.9 +476.9 +476.9 +232.9 +476.9 +465.8 +476.9 +476.9 +1900.0 +476.9 +1900.0 +238.5 +953.9 +447.1 +953.9 +476.9 +953.9 +953.9 +953.9 +953.9 +476.9 +953.9 +931.5 +953.9 +476.9 +931.5 +1800.0 +931.5 +476.9 +953.9 +953.9 +953.9 +894.3 +238.5 +953.9 +953.9 +476.9 +476.9 +476.9 +465.8 +953.9 +1900.0 +953.9 +476.9 +238.5 +931.5 +476.9 +476.9 +476.9 +447.1 +476.9 +238.5 +238.5 +238.5 +476.9 +476.9 +476.9 +476.9 +476.9 +953.9 +476.9 +953.9 +476.9 +953.9 +447.1 +953.9 +476.9 +476.9 +476.9 +931.5 +238.5 +953.9 +1500.0 +953.9 +476.9 +953.9 +953.9 +953.9 +953.9 +953.9 +1900.0 +465.8 +894.3 +476.9 +953.9 +953.9 +931.5 +931.5 +238.5 +476.9 +232.9 +3600.0 +476.9 +465.8 +953.9 +465.8 +931.5 +238.5 +953.9 +238.5 +953.9 +953.9 +476.9 +476.9 +476.9 +953.9 +476.9 +238.5 +465.8 +476.9 +465.8 +476.9 +447.1 +1800.0 +476.9 +476.9 +476.9 +476.9 +476.9 +476.9 +238.5 +953.9 +476.9 +238.5 +953.9 +465.8 +238.5 +1800.0 +476.9 +238.5 +476.9 +223.6 +476.9 +238.5 +476.9 +953.9 +476.9 +476.9 +1800.0 +476.9 +476.9 +476.9 +238.5 +465.8 +465.8 +476.9 +465.8 +1800.0 +476.9 +953.9 +232.9 +476.9 +931.5 +232.9 +476.9 +953.9 +476.9 +931.5 +953.9 +476.9 +953.9 +476.9 +238.5 +476.9 +238.5 +953.9 +476.9 +238.5 +476.9 +931.5 +476.9 +476.9 +476.9 +476.9 +476.9 +476.9 +953.9 +953.9 +447.1 +953.9 +238.5 +476.9 +238.5 +931.5 +931.5 +953.9 +447.1 +931.5 +447.1 +476.9 +476.9 +238.5 +476.9 +476.9 +465.8 +232.9 +1900.0 +238.5 +476.9 +476.9 +476.9 +953.9 +476.9 +476.9 +953.9 +238.5 +238.5 +476.9 +238.5 +238.5 +476.9 +232.9 +465.8 +238.5 +476.9 +238.5 +238.5 +1900.0 +11600.0 +931.5 +232.9 +953.9 +238.5 +953.9 +953.9 +238.5 +476.9 +953.9 +476.9 +238.5 +447.1 +931.5 +465.8 +476.9 +953.9 +238.5 +476.9 +476.9 +238.5 +232.9 +476.9 +238.5 +476.9 +476.9 +232.9 +476.9 +953.9 +476.9 +238.5 +232.9 +953.9 +953.9 +931.5 +953.9 +1800.0 +476.9 +238.5 +476.9 +931.5 +476.9 +238.5 +238.5 +476.9 +953.9 +465.8 +465.8 +238.5 +953.9 +1900.0 +232.9 +476.9 +238.5 +1800.0 +953.9 +953.9 +931.5 +465.8 +232.9 +953.9 +476.9 +238.5 +476.9 +238.5 +465.8 +476.9 +238.5 +476.9 +476.9 +238.5 +476.9 +953.9 +238.5 +476.9 +223.6 +953.9 +476.9 +238.5 +894.3 +465.8 +476.9 +476.9 +476.9 +238.5 +476.9 +476.9 +238.5 +447.1 +119.2 +1100.0 +953.9 +476.9 +238.5 +476.9 +232.9 +232.9 +476.9 +953.9 +476.9 +1900.0 +238.5 +476.9 +476.9 +238.5 +238.5 +953.9 +476.9 +476.9 +232.9 +238.5 +238.5 +476.9 +238.5 +476.9 +238.5 +465.8 +953.9 +232.9 +476.9 +953.9 +931.5 +1800.0 +476.9 +953.9 +1800.0 +476.9 +476.9 +238.5 +238.5 +238.5 +931.5 +465.8 +1800.0 +745.2 +476.9 +465.8 +238.5 +476.9 +953.9 +238.5 +953.9 +238.5 +953.9 +476.9 +1800.0 +931.5 +476.9 +238.5 +238.5 +238.5 +238.5 +476.9 +223.6 +223.6 +953.9 +465.8 +238.5 +476.9 +238.5 +238.5 +953.9 +1500.0 +238.5 +1900.0 +476.9 +238.5 +476.9 +372.6 +465.8 +476.9 +465.9 +1800.0 +931.5 +1100.0 +238.5 +476.9 +238.5 +238.5 +953.9 +238.5 +476.9 +476.9 +931.5 +953.9 +465.8 +953.9 +238.5 +238.5 +238.5 +931.5 +953.9 +476.9 +476.9 +931.5 +476.9 +476.9 +476.9 +476.9 +476.9 +232.9 +238.5 +238.5 +238.5 +476.9 +232.9 +476.9 +476.9 +476.9 
+953.9 +476.9 +238.5 +476.9 +238.5 +476.9 +931.5 +238.5 +238.5 +238.5 +238.5 +465.8 +476.9 +476.9 +476.9 +953.9 +476.9 +476.9 +476.9 +476.9 +238.5 +476.9 +238.5 +476.9 +238.5 +931.5 +476.9 +476.9 +238.5 diff --git a/data/samples/stocks_de.csv b/data/samples/stocks_de.csv new file mode 100644 index 0000000..8dced8f --- /dev/null +++ b/data/samples/stocks_de.csv @@ -0,0 +1,1024 @@ +15.455 +15.458 +15.462 +15.448 +15.45 +15.495 +15.435 +15.438 +15.435 +15.438 +15.44 +15.435 +15.472 +15.475 +15.452 +15.418 +15.43 +15.415 +15.418 +15.415 +15.412 +15.44 +15.412 +15.418 +15.39 +15.39 +15.392 +15.387 +15.39 +15.387 +15.39 +15.387 +15.39 +15.387 +15.39 +15.387 +15.39 +15.387 +15.39 +15.387 +15.39 +15.387 +15.392 +15.375 +15.378 +15.407 +15.392 +15.372 +15.387 +15.37 +15.372 +15.375 +15.39 +15.358 +15.355 +15.302 +15.31 +15.312 +15.368 +15.36 +15.362 +15.37 +15.368 +15.355 +15.352 +15.365 +15.352 +15.348 +15.348 +15.35 +15.335 +15.302 +15.3 +15.298 +15.292 +15.298 +15.295 +15.3 +15.295 +15.3 +15.295 +15.3 +15.295 +15.3 +15.295 +15.3 +15.295 +15.3 +15.295 +15.292 +15.288 +15.295 +15.3 +15.31 +15.32 +15.317 +15.317 +15.312 +15.31 +15.312 +15.315 +15.317 +15.315 +15.295 +15.272 +15.275 +15.278 +15.275 +15.27 +15.272 +15.268 +15.27 +15.272 +15.26 +15.27 +15.235 +15.247 +15.25 +15.252 +15.25 +15.253 +15.248 +15.255 +15.245 +15.25 +15.238 +15.227 +15.258 +15.24 +15.245 +15.23 +15.232 +15.225 +15.227 +15.238 +15.247 +15.24 +15.245 +15.242 +15.248 +15.245 +15.247 +15.24 +15.245 +15.255 +15.252 +15.258 +15.272 +15.268 +15.262 +15.255 +15.247 +15.252 +15.255 +15.27 +15.272 +15.288 +15.28 +15.29 +15.29 +15.278 +15.272 +15.262 +15.27 +15.23 +15.227 +15.225 +15.227 +15.23 +15.232 +15.238 +15.22 +15.218 +15.22 +15.212 +15.21 +15.208 +15.21 +15.205 +15.202 +15.205 +15.208 +15.198 +15.2 +15.198 +15.182 +15.18 +15.182 +15.177 +15.175 +15.178 +15.185 +15.19 +15.198 +15.188 +15.185 +15.182 +15.177 +15.18 +15.185 +15.178 +15.17 +15.172 +15.148 +15.145 +15.148 +15.145 +15.148 +15.145 +15.14 +15.135 +15.122 +15.125 +15.112 +15.118 +15.122 +15.12 +15.118 +15.115 +15.112 +15.11 +15.118 +15.115 +15.118 +15.112 +15.1 +15.112 +15.11 +15.108 +15.11 +15.125 +15.12 +15.13 +15.135 +15.142 +15.148 +15.152 +15.16 +15.16 +15.15 +15.145 +15.135 +15.14 +15.142 +15.14 +15.148 +15.15 +15.132 +15.135 +15.137 +15.14 +15.142 +15.152 +15.142 +15.148 +15.145 +15.132 +15.125 +15.122 +15.125 +15.11 +15.105 +15.11 +15.108 +15.105 +15.1 +15.102 +15.11 +15.12 +15.118 +15.122 +15.12 +15.122 +15.12 +15.118 +15.118 +15.118 +15.12 +15.118 +15.12 +15.125 +15.135 +15.137 +15.142 +15.145 +15.135 +15.137 +15.142 +15.14 +15.13 +15.14 +15.148 +15.15 +15.155 +15.168 +15.17 +15.148 +15.152 +15.145 +15.115 +15.11 +15.115 +15.12 +15.122 +15.12 +15.135 +15.14 +15.145 +15.142 +15.14 +15.13 +15.125 +15.135 +15.142 +15.145 +15.142 +15.142 +15.145 +15.142 +15.14 +15.137 +15.152 +15.142 +15.148 +15.152 +15.155 +15.157 +15.165 +15.168 +15.168 +15.163 +15.16 +15.162 +15.168 +15.162 +15.148 +15.115 +15.118 +15.128 +15.132 +15.128 +15.125 +15.122 +15.12 +15.122 +15.125 +15.122 +15.12 +15.118 +15.108 +15.102 +15.115 +15.115 +15.11 +15.118 +15.13 +15.137 +15.135 +15.137 +15.142 +15.137 +15.142 +15.125 +15.1 +15.102 +15.108 +15.11 +15.115 +15.12 +15.125 +15.125 +15.14 +15.148 +15.16 +15.168 +15.162 +15.148 +15.142 +15.122 +15.128 +15.13 +15.148 +15.15 +15.152 +15.14 +15.132 +15.135 +15.137 +15.14 +15.137 +15.14 +15.142 +15.145 +15.148 +15.15 +15.145 +15.142 +15.14 +15.1 +15.102 +15.102 +15.108 +15.11 +15.108 +15.11 +15.115 +15.118 +15.12 +15.122 +15.12 +15.125 +15.132 
+15.14 +15.137 +15.14 +15.137 +15.14 +15.137 +15.14 +15.145 +15.14 +15.137 +15.142 +15.14 +15.137 +15.14 +15.137 +15.135 +15.137 +15.14 +15.142 +15.14 +15.135 +15.132 +15.12 +15.115 +15.12 +15.115 +15.067 +15.065 +15.082 +15.08 +15.082 +15.09 +15.102 +15.1 +15.102 +15.105 +15.11 +15.112 +15.108 +15.105 +15.108 +15.11 +15.108 +15.09 +15.092 +15.095 +15.092 +15.085 +15.082 +15.062 +15.065 +15.062 +15.073 +15.08 +15.085 +15.087 +15.09 +15.092 +15.098 +15.1 +15.11 +15.105 +15.11 +15.112 +15.108 +15.112 +15.11 +15.108 +15.105 +15.1 +15.085 +15.08 +15.078 +15.073 +15.08 +15.082 +15.08 +15.085 +15.095 +15.098 +15.105 +15.098 +15.1 +15.092 +15.095 +15.08 +15.075 +15.078 +15.09 +15.085 +15.082 +15.08 +15.078 +15.08 +15.078 +15.08 +15.082 +15.085 +15.08 +15.085 +15.082 +15.078 +15.078 +15.082 +15.08 +15.062 +15.06 +15.055 +15.06 +15.062 +15.06 +15.062 +15.082 +15.082 +15.092 +15.098 +15.09 +15.085 +15.082 +15.085 +15.09 +15.078 +15.073 +15.075 +15.073 +15.078 +15.092 +15.102 +15.098 +15.1 +15.092 +15.082 +15.09 +15.092 +15.098 +15.102 +15.108 +15.105 +15.108 +15.102 +15.1 +15.098 +15.102 +15.092 +15.1 +15.105 +15.108 +15.112 +15.108 +15.105 +15.11 +15.115 +15.115 +15.108 +15.115 +15.12 +15.115 +15.108 +15.1 +15.087 +15.085 +15.082 +15.092 +15.095 +15.098 +15.102 +15.09 +15.092 +15.095 +15.092 +15.095 +15.092 +15.08 +15.085 +15.09 +15.092 +15.098 +15.102 +15.098 +15.095 +15.098 +15.095 +15.088 +15.092 +15.09 +15.092 +15.095 +15.1 +15.098 +15.095 +15.098 +15.092 +15.095 +15.092 +15.095 +15.085 +15.078 +15.075 +15.08 +15.082 +15.08 +15.088 +15.082 +15.085 +15.09 +15.092 +15.095 +15.1 +15.098 +15.095 +15.092 +15.088 +15.088 +15.095 +15.088 +15.09 +15.095 +15.092 +15.09 +15.085 +15.09 +15.082 +15.09 +15.085 +15.095 +15.098 +15.095 +15.098 +15.1 +15.095 +15.092 +15.098 +15.112 +15.105 +15.108 +15.102 +15.095 +15.09 +15.08 +15.088 +15.067 +15.07 +15.078 +15.075 +15.078 +15.075 +15.078 +15.08 +15.085 +15.087 +15.092 +15.09 +15.082 +15.085 +15.092 +15.09 +15.092 +15.09 +15.092 +15.095 +15.1 +15.098 +15.09 +15.092 +15.095 +15.1 +15.098 +15.1 +15.1 +15.098 +15.078 +15.08 +15.078 +15.078 +15.075 +15.078 +15.08 +15.078 +15.075 +15.072 +15.08 +15.085 +15.078 +15.08 +15.075 +15.078 +15.08 +15.082 +15.088 +15.095 +15.092 +15.09 +15.088 +15.082 +15.085 +15.09 +15.088 +15.09 +15.092 +15.1 +15.102 +15.1 +15.102 +15.105 +15.098 +15.095 +15.092 +15.09 +15.088 +15.085 +15.08 +15.075 +15.085 +15.09 +15.095 +15.088 +15.085 +15.08 +15.085 +15.088 +15.085 +15.085 +15.082 +15.075 +15.078 +15.08 +15.085 +15.082 +15.08 +15.075 +15.078 +15.075 +15.07 +15.067 +15.07 +15.072 +15.07 +15.072 +15.067 +15.07 +15.072 +15.07 +15.072 +15.067 +15.07 +15.072 +15.078 +15.075 +15.078 +15.08 +15.082 +15.085 +15.092 +15.09 +15.088 +15.085 +15.08 +15.082 +15.092 +15.095 +15.098 +15.1 +15.098 +15.092 +15.09 +15.078 +15.088 +15.082 +15.078 +15.08 +15.078 +15.075 +15.078 +15.08 +15.075 +15.078 +15.08 +15.07 +15.072 +15.07 +15.065 +15.062 +15.055 +15.052 +15.058 +15.052 +15.055 +15.06 +15.058 +15.055 +15.058 +15.06 +15.062 +15.06 +15.062 +15.06 +15.062 +15.055 +15.058 +15.065 +15.062 +15.067 +15.067 +15.075 +15.072 +15.078 +15.072 +15.075 +15.072 +15.078 +15.075 +15.078 +15.075 +15.078 +15.075 +15.08 +15.085 +15.09 +15.088 +15.082 +15.075 +15.072 +15.067 +15.065 +15.067 +15.072 +15.07 +15.072 +15.075 +15.055 +15.06 +15.065 +15.07 +15.07 +15.067 +15.065 +15.06 +15.062 +15.052 +15.055 +15.05 +15.055 +15.057 +15.06 +15.052 +15.048 +15.05 +15.048 +15.05 +15.055 +15.055 +15.06 +15.062 +15.06 +15.062 +15.072 +15.075 +15.072 +15.07 +15.065 +15.062 +15.065 
+15.07 +15.067 +15.06 +15.058 +15.058 +15.055 +15.05 +15.045 +15.05 +15.055 +15.052 +15.055 +15.048 +15.045 +15.042 +15.04 +15.048 +15.05 +15.055 +15.048 +15.042 +15.045 +15.05 +15.058 +15.052 +15.05 +15.048 +15.05 +15.055 +15.058 +15.062 +15.06 +15.065 +15.058 +15.055 +15.058 +15.055 +15.05 +15.04 +15.048 +15.052 +15.052 +15.05 +15.042 +15.045 +15.04 +15.042 +15.038 +15.04 +15.042 +15.05 +15.042 +15.04 +15.038 +15.04 +15.038 +15.035 +15.038 +15.035 +15.032 +15.028 +15.032 +15.028 +15.03 +15.035 +15.032 +15.028 +15.03 +15.032 +15.01 +15.025 +15.01 +15.012 +15.028 +15.015 +15.02 +15.022 +15.03 +15.032 +15.018 +14.997 +15.012 +15.018 +15.015 +15.003 +15.0 +14.998 +15.003 +15.005 +15.008 +15.01 +15.012 +15.015 +15.018 +15.015 +15.018 +15.015 +15.01 +15.015 +15.01 +15.008 +15.005 +15.01 +15.012 +15.015 +15.008 +15.002 +15.0 +15.002 +14.988 +14.985 +14.977 +14.98 +14.982 +14.985 +14.99 +14.992 +14.99 +14.992 +14.995 +14.997 +15.0 +14.998 +15.0 +15.003 +15.0 +14.992 +14.988 +14.99 +14.988 +14.985 +14.977 +14.97 +14.977 +14.98 +14.982 +14.98 +14.977 +14.98 +14.985 +14.98 +14.988 +14.99 +14.992 +14.995 +15.0 +14.995 +14.997 +14.998 +15.0 diff --git a/data/samples/stocks_uk.csv b/data/samples/stocks_uk.csv new file mode 100644 index 0000000..da7ad66 --- /dev/null +++ b/data/samples/stocks_uk.csv @@ -0,0 +1,1024 @@ +1723.0 +1722.0 +1721.0 +1720.0 +1719.0 +1717.5 +1715.5 +1713.5 +1712.5 +1712.0 +1711.0 +1709.5 +1708.5 +1713.0 +1712.0 +1710.5 +1711.0 +1711.5 +1712.0 +1712.5 +1712.0 +1712.5 +1712.0 +1712.5 +1712.0 +1712.5 +1710.5 +1710.0 +1709.5 +1710.0 +1709.5 +1710.0 +1709.5 +1710.0 +1709.5 +1710.0 +1701.5 +1710.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1701.0 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1701.5 +1702.0 +1699.0 +1701.5 +1702.0 +1707.5 +1701.5 +1707.5 +1706.0 +1707.5 +1701.5 +1707.5 +1706.0 +1707.5 +1701.5 +1702.0 +1701.5 +1702.0 +1701.0 +1699.5 +1698.5 +1696.5 +1696.0 +1696.5 +1696.0 +1696.5 +1696.0 +1696.5 +1696.0 +1696.5 +1696.0 +1696.5 +1696.0 +1696.5 +1696.0 +1696.5 +1695.0 +1697.0 +1697.5 +1698.0 +1700.5 +1694.5 +1703.0 +1697.5 +1698.0 +1701.5 +1697.5 +1705.5 +1701.5 +1704.5 +1706.0 +1705.0 +1705.5 +1707.0 +1706.5 +1707.0 +1705.5 +1705.0 +1705.5 +1707.0 +1706.5 +1707.0 +1708.5 +1705.5 +1706.0 +1707.5 +1710.0 +1706.5 +1707.0 +1706.5 +1704.5 +1703.5 +1704.0 +1705.5 +1705.0 +1705.5 +1709.0 +1705.5 +1706.0 +1705.5 +1706.0 +1705.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1704.5 +1706.0 +1704.5 +1705.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1710.0 +1706.0 +1709.5 +1708.0 +1706.0 +1706.5 +1706.0 +1706.5 +1707.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1710.0 +1708.5 +1706.5 +1707.0 +1706.5 +1707.0 +1706.5 +1707.0 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1708.5 +1706.0 +1706.5 +1706.0 +1706.5 
+1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1710.0 +1708.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1706.5 +1706.0 +1705.5 +1706.0 +1705.5 +1706.0 +1705.5 +1706.0 +1705.5 +1706.0 +1706.5 +1706.0 +1704.5 +1706.5 +1703.5 +1704.0 +1706.5 +1705.5 +1705.0 +1705.0 +1705.5 +1705.0 +1704.5 +1705.0 +1703.5 +1705.0 +1704.5 +1705.0 +1704.5 +1704.0 +1704.5 +1702.5 +1704.0 +1704.0 +1704.5 +1705.0 +1708.0 +1706.5 +1702.0 +1704.5 +1702.5 +1703.0 +1702.5 +1702.0 +1702.5 +1703.0 +1702.5 +1702.0 +1699.5 +1703.0 +1699.5 +1703.0 +1702.5 +1702.0 +1700.5 +1698.0 +1698.5 +1700.0 +1700.5 +1700.0 +1697.0 +1700.5 +1699.5 +1700.0 +1699.5 +1699.0 +1699.5 +1697.0 +1699.5 +1699.0 +1699.5 +1700.0 +1699.5 +1699.0 +1699.5 +1700.0 +1699.5 +1699.0 +1699.5 +1700.0 +1699.5 +1700.0 +1699.5 +1699.0 +1700.0 +1699.5 +1699.0 +1699.5 +1700.0 +1699.5 +1702.5 +1698.0 +1698.5 +1697.0 +1697.5 +1697.0 +1697.5 +1697.0 +1697.5 +1697.0 +1697.5 +1699.0 +1698.5 +1699.0 +1702.0 +1699.0 +1700.5 +1698.5 +1699.0 +1698.5 +1699.0 +1698.5 +1699.0 +1698.5 +1699.0 +1698.5 +1699.0 +1698.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1698.5 +1702.0 +1698.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1699.0 +1699.5 +1700.0 +1699.5 +1699.0 +1697.5 +1698.0 +1697.5 +1698.0 +1699.0 +1700.5 +1701.5 +1701.0 +1699.5 +1701.0 +1701.5 +1701.0 +1699.0 +1701.5 +1701.0 +1698.5 +1695.5 +1697.0 +1696.0 +1696.5 +1697.5 +1697.0 +1701.0 +1701.0 +1697.0 +1695.0 +1694.5 +1696.0 +1697.5 +1700.0 +1697.5 +1700.0 +1699.5 +1700.0 +1697.5 +1699.0 +1697.0 +1700.0 +1700.5 +1701.5 +1698.5 +1701.0 +1699.5 +1702.5 +1703.0 +1704.0 +1703.5 +1703.5 +1704.0 +1703.5 +1703.0 +1703.5 +1703.0 +1702.5 +1703.0 +1702.5 +1702.0 +1702.5 +1703.0 +1702.5 +1702.0 +1702.5 +1703.0 +1702.5 +1702.0 +1702.5 +1703.0 +1704.0 +1705.5 +1705.0 +1703.5 +1704.5 +1704.0 +1703.5 +1704.0 +1703.5 +1703.0 +1703.5 +1704.0 +1704.5 +1705.0 +1704.5 +1703.0 +1704.5 +1702.5 +1703.0 +1705.0 +1704.5 +1703.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1704.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1706.0 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1705.0 +1705.5 +1706.0 +1705.0 +1705.5 +1705.0 +1706.0 +1706.5 +1707.0 +1706.5 +1707.0 +1706.5 +1707.0 +1706.5 +1707.0 +1705.0 +1704.5 +1705.0 +1708.5 +1707.5 +1708.5 +1709.0 +1708.0 +1708.5 +1708.0 +1711.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1709.0 +1708.5 +1708.0 +1708.5 +1709.0 +1708.5 +1709.0 +1709.5 +1710.0 +1709.5 +1712.0 +1710.0 +1709.5 +1710.0 +1709.5 +1710.0 +1709.5 +1710.0 +1709.5 +1710.0 +1709.5 +1710.0 +1712.5 +1711.5 +1710.5 +1711.0 +1713.0 +1710.5 +1711.0 +1714.0 +1715.5 +1711.5 +1716.5 +1713.0 +1716.5 +1716.0 +1712.5 +1713.5 +1713.0 +1714.5 +1716.5 +1713.5 +1714.0 +1713.5 +1714.0 +1713.5 +1714.0 +1713.5 +1714.0 +1713.5 +1714.0 +1713.5 +1714.0 +1713.5 +1713.0 +1717.0 +1713.5 +1714.0 +1714.5 +1713.5 +1714.0 +1714.5 +1715.0 +1717.0 +1716.0 +1721.0 +1719.5 +1715.5 +1717.0 +1718.5 +1721.0 +1721.0 +1717.5 +1719.0 +1722.0 +1718.0 +1718.5 +1721.0 +1718.5 +1721.0 +1722.0 +1719.5 +1725.5 +1723.5 +1723.0 +1724.0 +1722.5 +1721.5 +1722.0 +1721.5 +1722.0 +1721.0 +1719.5 +1722.5 +1721.0 +1719.5 +1722.5 +1723.5 +1721.0 +1720.5 +1721.0 +1720.5 +1721.0 +1720.0 +1721.0 +1725.0 +1722.0 +1721.5 +1725.0 +1724.5 
+1726.5 +1723.0 +1721.5 +1722.5 +1722.0 +1723.5 +1722.0 +1723.5 +1722.5 +1725.0 +1724.0 +1722.0 +1723.5 +1726.5 +1725.0 +1723.5 +1722.0 +1723.5 +1725.0 +1722.0 +1723.5 +1723.0 +1725.5 +1726.5 +1727.5 +1724.5 +1724.0 +1724.5 +1723.5 +1724.0 +1723.0 +1723.0 +1724.0 +1722.0 +1722.5 +1722.0 +1723.0 +1723.5 +1724.0 +1723.0 +1723.5 +1723.0 +1725.5 +1727.0 +1727.0 +1727.0 +1724.5 +1728.0 +1725.0 +1724.0 +1727.5 +1724.0 +1724.5 +1728.0 +1726.0 +1728.5 +1726.5 +1729.0 +1728.0 +1726.5 +1728.5 +1730.0 +1731.0 +1730.5 +1727.5 +1730.0 +1726.5 +1726.0 +1726.5 +1726.0 +1726.5 +1725.5 +1728.5 +1728.5 +1727.0 +1727.5 +1726.0 +1727.5 +1726.0 +1727.5 +1726.0 +1725.5 +1724.5 +1725.0 +1722.5 +1720.0 +1719.5 +1720.5 +1723.5 +1722.5 +1723.5 +1724.5 +1727.5 +1724.5 +1724.0 +1724.5 +1726.5 +1723.5 +1724.5 +1723.0 +1724.5 +1726.0 +1722.5 +1720.5 +1723.5 +1724.5 +1723.5 +1726.0 +1722.0 +1726.0 +1724.5 +1726.5 +1726.0 +1729.0 +1727.5 +1725.0 +1725.5 +1726.0 +1725.5 +1726.0 +1725.5 +1726.0 +1726.5 +1726.0 +1725.5 +1726.0 +1725.5 +1726.0 +1725.5 +1727.5 +1726.0 +1725.5 +1725.0 +1725.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.5 +1724.5 +1723.0 +1721.0 +1722.0 +1721.0 +1721.5 +1721.0 +1721.0 +1720.5 +1721.0 +1720.5 +1721.0 +1720.5 +1721.0 +1721.5 +1721.0 +1721.5 +1721.0 +1721.5 +1723.0 +1720.0 +1720.5 +1721.0 +1721.5 +1721.0 +1721.5 +1721.0 +1721.5 +1721.0 +1721.5 +1721.0 +1721.5 +1721.0 +1721.5 +1723.0 +1721.5 +1723.0 +1725.5 +1724.5 +1728.5 +1724.0 +1724.5 +1725.0 +1724.5 +1725.0 +1728.0 +1728.0 +1724.5 +1726.5 +1726.0 +1724.5 +1724.0 +1724.5 +1725.5 +1727.5 +1726.5 +1727.0 +1728.5 +1727.0 +1727.5 +1727.0 +1727.5 +1727.0 +1727.5 +1729.0 +1726.5 +1725.0 +1724.0 +1724.5 +1724.0 +1724.0 +1723.5 +1723.0 +1723.5 +1723.0 +1723.5 +1723.0 +1725.0 +1725.5 +1725.0 +1725.0 +1725.5 +1726.5 +1724.0 +1724.5 +1725.0 +1727.0 +1725.0 +1725.5 +1725.0 +1726.5 +1725.0 +1725.5 +1724.0 +1724.5 +1725.0 +1724.5 +1725.0 +1726.5 +1725.5 +1725.0 +1724.0 +1724.5 +1726.0 +1724.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.0 +1724.5 +1725.0 +1726.5 +1727.5 +1727.0 +1726.0 +1727.5 +1724.5 +1729.5 +1725.0 +1729.0 +1728.5 +1727.0 +1727.5 +1729.0 +1728.5 +1729.0 +1728.5 +1729.0 +1728.5 +1729.0 +1728.5 +1728.0 +1728.5 +1729.0 +1730.5 +1729.5 +1730.0 +1730.5 +1731.5 +1730.5 +1731.0 +1731.5 +1731.0 +1731.5 +1731.0 +1730.5 +1731.0 +1732.5 +1731.0 +1731.5 +1731.0 +1731.5 diff --git a/data/samples/stocks_usa_c.csv b/data/samples/stocks_usa_c.csv new file mode 100644 index 0000000..7fdc400 --- /dev/null +++ b/data/samples/stocks_usa_c.csv @@ -0,0 +1,1024 @@ +22.49 +22.5 +22.49 +22.5 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.99 +21.98 +21.99 +21.98 +21.97 +21.98 +21.97 +21.96 +21.98 +21.96 +21.98 +21.99 +21.99 +21.99 +22.0 +21.99 +22.0 +21.99 +22.0 +21.99 +21.98 +21.97 +21.94 +21.96 +21.94 +21.94 +21.94 +21.94 +21.94 +21.96 +21.96 +21.95 +21.96 +21.96 +21.96 +21.95 +21.96 +21.95 +21.95 +21.94 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.94 +21.92 +21.92 +21.92 +21.92 +21.92 +21.9 +21.92 +21.92 +21.92 +21.9 +21.88 +21.88 +21.89 +21.88 +21.89 +21.9 +21.91 +21.9 +21.9 +21.9 +21.91 +21.92 +21.88 +21.89 +21.87 +21.88 +21.88 +21.89 +21.9 +21.91 +21.92 +21.9 +21.91 +21.92 +21.92 +21.92 +21.92 +21.93 +21.92 +21.92 +21.92 +21.9 +21.9 +21.9 +21.9 +21.91 +21.9 +21.89 +21.9 +21.88 +21.88 +21.88 +21.88 +21.86 +21.85 +21.86 +21.86 +21.86 +21.86 +21.86 +21.85 +21.86 +21.85 +21.84 +21.84 +21.81 +21.82 +21.83 +21.83 +21.84 +21.84 
+21.84 +21.85 +21.85 +21.84 +21.83 +21.81 +21.81 +21.81 +21.81 +21.82 +21.83 +21.84 +21.84 +21.86 +21.86 +21.86 +21.86 +21.87 +21.86 +21.88 +21.87 +21.86 +21.88 +21.87 +21.86 +21.88 +21.87 +21.88 +21.87 +21.85 +21.85 +21.85 +21.86 +21.85 +21.86 +21.84 +21.85 +21.86 +21.86 +21.85 +21.84 +21.85 +21.85 +21.85 +21.84 +21.85 +21.85 +21.85 +21.86 +21.85 +21.86 +21.87 +21.88 +21.86 +21.86 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.81 +21.82 +21.81 +21.82 +21.81 +21.8 +21.81 +21.8 +21.8 +21.8 +21.8 +21.8 +21.78 +21.78 +21.78 +21.78 +21.78 +21.78 +21.76 +21.78 +21.77 +21.78 +21.78 +21.78 +21.76 +21.77 +21.78 +21.77 +21.76 +21.76 +21.76 +21.76 +21.76 +21.77 +21.78 +21.78 +21.8 +21.81 +21.82 +21.81 +21.82 +21.81 +21.82 +21.84 +21.85 +21.86 +21.87 +21.88 +21.87 +21.88 +21.88 +21.86 +21.86 +21.86 +21.85 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.84 +21.83 +21.84 +21.84 +21.84 +21.85 +21.84 +21.85 +21.84 +21.85 +21.86 +21.85 +21.84 +21.84 +21.84 +21.85 +21.85 +21.86 +21.87 +21.88 +21.88 +21.88 +21.89 +21.88 +21.9 +21.88 +21.9 +21.88 +21.88 +21.86 +21.88 +21.88 +21.88 +21.88 +21.88 +21.86 +21.88 +21.89 +21.9 +21.9 +21.92 +21.92 +21.92 +21.92 +21.91 +21.9 +21.92 +21.92 +21.92 +21.91 +21.92 +21.9 +21.91 +21.9 +21.9 +21.92 +21.91 +21.9 +21.92 +21.92 +21.92 +21.92 +21.91 +21.92 +21.9 +21.91 +21.92 +21.92 +21.93 +21.94 +21.93 +21.94 +21.93 +21.94 +21.94 +21.94 +21.93 +21.92 +21.93 +21.94 +21.93 +21.93 +21.94 +21.93 +21.94 +21.93 +21.92 +21.94 +21.93 +21.94 +21.94 +21.94 +21.95 +21.96 +21.95 +21.96 +21.95 +21.94 +21.95 +21.96 +21.96 +21.96 +21.97 +21.97 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.99 +21.98 +21.99 +21.98 +21.99 +21.99 +22.0 +21.99 +21.98 +21.99 +22.0 +21.99 +22.0 +22.0 +21.99 +22.0 +21.99 +21.99 +21.98 +21.96 +21.96 +21.96 +21.96 +21.96 +21.95 +21.96 +21.94 +21.94 +21.94 +21.94 +21.93 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.94 +21.94 +21.93 +21.94 +21.93 +21.92 +21.94 +21.93 +21.92 +21.92 +21.91 +21.92 +21.93 +21.94 +21.92 +21.93 +21.94 +21.93 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.93 +21.94 +21.94 +21.95 +21.96 +21.96 +21.96 +21.96 +21.97 +21.98 +21.98 +21.99 +21.99 +21.99 +22.0 +22.0 +22.0 +21.99 +21.99 +21.98 +21.97 +21.98 +21.96 +21.97 +21.96 +21.98 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.95 +21.94 +21.94 +21.94 +21.94 +21.94 +21.94 +21.93 +21.94 +21.96 +21.96 +21.96 +21.96 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.97 +21.96 +21.96 +21.96 +21.96 +21.96 +21.98 +21.98 +21.99 +21.99 +21.99 +21.99 +21.99 +21.99 +22.01 +22.0 +22.02 +22.02 +22.02 +22.01 +22.02 +22.01 +21.99 +21.99 +21.98 +21.98 +21.98 +21.98 +21.97 +21.96 +21.96 +21.96 +21.96 +21.97 +21.96 +21.97 +21.96 +21.96 +21.96 +21.95 +21.94 +21.93 +21.92 +21.93 +21.93 +21.94 +21.93 +21.92 +21.92 +21.94 +21.94 +21.93 +21.92 +21.92 +21.94 +21.92 +21.94 +21.93 +21.94 +21.93 +21.92 +21.93 +21.94 +21.94 +21.94 +21.94 +21.94 +21.93 +21.94 +21.93 +21.94 +21.93 +21.94 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.92 +21.93 +21.92 +21.93 +21.94 +21.92 +21.92 +21.92 +21.91 +21.9 +21.92 +21.92 +21.92 +21.92 +21.93 +21.92 +21.94 +21.94 +21.94 +21.94 +21.94 +21.93 +21.94 +21.92 +21.93 +21.94 +21.94 +21.94 +21.94 +21.94 +21.94 +21.94 +21.94 +21.94 +21.94 +21.93 +21.92 +21.94 +21.93 +21.92 +21.94 +21.94 +21.94 +21.94 +21.94 +21.95 +21.96 +21.95 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 +21.96 
+21.97 +21.96 +21.97 +21.98 +21.99 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.98 +21.97 +21.98 +21.98 +21.98 +21.96 +21.96 +21.96 +21.96 +21.98 +21.98 +21.98 +21.99 +21.99 +22.0 +21.99 +22.0 +22.0 +22.02 +22.01 +22.02 +22.02 +22.02 +22.02 +22.03 +22.04 +22.04 +22.02 +22.03 +22.01 +22.02 +22.01 +22.02 +22.02 +22.03 +22.02 +22.02 +22.03 +22.03 +22.02 +22.04 +22.03 +22.04 +22.04 +22.04 +22.04 +22.03 +22.02 +22.03 +22.02 +22.03 +22.02 +22.03 +22.02 +22.02 +22.02 +22.02 +22.03 +22.04 +22.04 +22.04 +22.05 +22.06 +22.06 +22.06 +22.07 +22.06 +22.06 +22.06 +22.07 +22.06 +22.05 +22.06 +22.06 +22.06 +22.07 +22.08 +22.07 +22.08 +22.07 +22.06 +22.06 +22.06 +22.05 +22.04 +22.04 +22.03 +22.02 +22.02 +22.02 +22.03 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.05 +22.06 +22.06 +22.06 +22.06 +22.07 +22.06 +22.07 +22.06 +22.07 +22.06 +22.07 +22.06 +22.07 +22.06 +22.07 +22.06 +22.08 +22.08 +22.08 +22.06 +22.07 +22.08 +22.07 +22.06 +22.07 +22.08 +22.08 +22.08 +22.06 +22.07 +22.08 +22.07 +22.08 +22.08 +22.08 +22.08 +22.09 +22.1 +22.1 +22.1 +22.11 +22.1 +22.11 +22.1 +22.1 +22.11 +22.12 +22.12 +22.12 +22.12 +22.12 +22.11 +22.12 +22.11 +22.12 +22.11 +22.12 +22.12 +22.12 +22.13 +22.12 +22.11 +22.13 +22.12 +22.12 +22.12 +22.12 +22.12 +22.1 +22.1 +22.1 +22.12 +22.11 +22.12 +22.12 +22.12 +22.12 +22.11 +22.12 +22.12 +22.12 +22.12 +22.12 +22.13 +22.13 +22.13 +22.12 +22.12 +22.13 +22.12 +22.13 +22.12 +22.13 +22.12 +22.13 +22.12 +22.13 +22.12 +22.13 +22.14 +22.15 +22.16 +22.16 +22.15 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.15 +22.16 +22.16 +22.16 +22.15 +22.16 +22.16 +22.16 +22.15 +22.15 +22.15 +22.16 +22.15 +22.16 +22.15 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.15 +22.16 +22.16 +22.15 +22.16 +22.15 +22.15 +22.15 +22.15 +22.16 +22.16 +22.16 +22.16 +22.16 +22.16 +22.15 +22.15 +22.15 +22.14 +22.12 +22.12 +22.11 +22.1 +22.1 +22.11 +22.1 +22.11 +22.1 +22.11 +22.1 +22.1 +22.09 +22.1 +22.09 +22.1 +22.1 +22.08 +22.08 +22.08 +22.08 +22.08 +22.07 +22.07 +22.06 +22.06 +22.06 +22.06 +22.02 +22.03 +22.04 +22.04 +22.03 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.05 +22.06 +22.05 +22.04 +22.05 +22.04 +22.06 +22.05 +22.05 +22.06 +22.04 +22.06 +22.05 +22.06 +22.06 +22.06 +22.06 +22.06 +22.06 +22.06 +22.05 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.04 +22.03 +22.04 +22.03 +22.04 +22.03 +22.02 +22.02 +22.02 +22.02 +22.02 +22.02 +22.03 +22.04 diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 0000000..a2da718 --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1,14 @@ +add_executable(simple_compress simple_compress.cpp) +target_link_libraries(simple_compress PRIVATE ALP) + +add_executable(simple_compress32 simple_compress32.cpp) +target_link_libraries(simple_compress32 PRIVATE ALP) + +add_executable(rd_compress rd_compress.cpp) +target_link_libraries(rd_compress PRIVATE ALP) + +add_executable(adaptive_compress adaptive_compress.cpp) +target_link_libraries(adaptive_compress PRIVATE ALP) + +add_executable(rd_compress32 rd_compress32.cpp) +target_link_libraries(rd_compress32 PRIVATE ALP) diff --git a/example/adaptive_compress.cpp b/example/adaptive_compress.cpp new file mode 100644 index 0000000..f9c1997 --- /dev/null +++ b/example/adaptive_compress.cpp @@ -0,0 +1,48 @@ +#include "alp.hpp" +#include "helper.hpp" +#include + +int main() { + 
size_t tuples_count = 1024 * 200;
+	size_t out_buffer_size =
+	    (tuples_count * sizeof(double)) + 8096; // We leave some headroom in case of negative compression
+	size_t uncompressed_size = tuples_count * sizeof(double);
+
+	double  in[tuples_count];
+	uint8_t out[out_buffer_size];
+	example::fill_random_data(in, tuples_count / 2, 2); // Half of the data has a limited precision
+	example::fill_random_data(
+	    (in + (tuples_count / 2)), tuples_count / 2, 18); // The other half has a random precision
+
+	/*
+	 * Compress
+	 */
+	alp::AlpCompressor compressor = alp::AlpCompressor();
+	printf("Compressing with ALP...\n");
+	compressor.compress(in, tuples_count, out);
+	size_t compressed_size   = compressor.get_size();
+	double compression_ratio = (double)uncompressed_size / compressed_size;
+	printf("Uncompressed size (in bytes): %zu\n", uncompressed_size);
+	printf("Compressed size (in bytes): %zu\n", compressed_size);
+	printf("Compression Ratio: %f\n\n", compression_ratio);
+
+	/*
+	 * Decompress
+	 */
+	size_t decompressed_buffer_size =
+	    alp::AlpApiUtils::align_value(tuples_count);
+	double decompressed[decompressed_buffer_size];
+	alp::AlpDecompressor decompressor = alp::AlpDecompressor();
+	printf("Decompressing with ALP...\n");
+	decompressor.decompress(out, tuples_count, decompressed);
+
+	/*
+	 * Validity Test
+	 */
+	for (size_t i = 0; i < tuples_count; i++) {
+		assert(in[i] == decompressed[i]);
+	}
+	printf("OK\n");
+
+	return 0;
+}
diff --git a/example/include/helper.hpp b/example/include/helper.hpp
new file mode 100644
index 0000000..c09e808
--- /dev/null
+++ b/example/include/helper.hpp
@@ -0,0 +1,25 @@
+#ifndef EXAMPLE_HELPER_HPP
+#define EXAMPLE_HELPER_HPP
+
+#include <cmath>
+#include <random>
+
+namespace example {
+
+template <typename T>
+inline void fill_random_data(T* in, size_t tuples_to_generate, uint8_t precision) {
+	std::uniform_real_distribution<T> unif(100, 300);
+	std::default_random_engine        re;
+	re.seed(42);
+	uint8_t doubles_intrinsic_precision = precision;
+	const T precision_multiplier        = std::pow(10.0, doubles_intrinsic_precision);
+	for (size_t i = 0; i < tuples_to_generate; i++) {
+		T random_value                 = unif(re);
+		T fixed_precision_random_value = std::round(random_value * precision_multiplier) / precision_multiplier;
+		in[i]                          = fixed_precision_random_value;
+	}
+}
+
+} // namespace example
+
+#endif // EXAMPLE_HELPER_HPP
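
For reference, a minimal driver for the helper above (a sketch; it assumes example/include is on the include path, and the printed values are only illustrative). The precision argument caps the number of decimal digits, so precision 2 produces values such as 187.45:

    #include "helper.hpp"
    #include <cstdio>

    int main() {
    	double buf[8];
    	example::fill_random_data(buf, 8, 2); // uniform values in [100, 300), rounded to 2 decimals
    	for (double v : buf) {
    		std::printf("%.4f\n", v); // e.g. 187.4500: nothing survives past the second decimal
    	}
    	return 0;
    }
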
diff --git a/example/rd_compress.cpp b/example/rd_compress.cpp
new file mode 100644
index 0000000..22b97a7
--- /dev/null
+++ b/example/rd_compress.cpp
@@ -0,0 +1,46 @@
+#include "alp.hpp"
+#include "helper.hpp"
+#include <cassert>
+
+int main() {
+	size_t tuples_count = 1024 * 100;
+	size_t out_buffer_size =
+	    (tuples_count * sizeof(double)) + 8096; // We leave some headroom in case of negative compression
+	size_t uncompressed_size = tuples_count * sizeof(double);
+
+	double  in[tuples_count];
+	uint8_t out[out_buffer_size];
+	example::fill_random_data(in, tuples_count, 20);
+
+	/*
+	 * If you know your doubles will have a random precision, you can directly use compress_rd
+	 */
+	alp::AlpCompressor compressor = alp::AlpCompressor();
+	printf("Compressing with ALPRD...\n");
+	compressor.compress_rd(in, tuples_count, out);
+	size_t compressed_size   = compressor.get_size();
+	double compression_ratio = (double)uncompressed_size / compressed_size;
+	printf("Uncompressed size (in bytes): %zu\n", uncompressed_size);
+	printf("Compressed size (in bytes): %zu\n", compressed_size);
+	printf("Compression Ratio: %f\n\n", compression_ratio);
+
+	/*
+	 * Decompress
+	 */
+	auto decompressed_buffer_size =
+	    alp::AlpApiUtils::align_value(tuples_count);
+	double decompressed[decompressed_buffer_size];
+	auto   decompressor = alp::AlpDecompressor();
+	printf("Decompressing with ALPRD...\n");
+	decompressor.decompress(out, tuples_count, decompressed);
+
+	/*
+	 * Validity Test
+	 */
+	for (size_t i = 0; i < tuples_count; i++) {
+		assert(in[i] == decompressed[i]);
+	}
+	printf("OK\n");
+
+	return 0;
+}
diff --git a/example/rd_compress32.cpp b/example/rd_compress32.cpp
new file mode 100644
index 0000000..111c7fb
--- /dev/null
+++ b/example/rd_compress32.cpp
@@ -0,0 +1,46 @@
+#include "alp.hpp"
+#include "helper.hpp"
+#include <cassert>
+
+int main() {
+	size_t tuples_count = (1024 * 100);
+	size_t out_buffer_size =
+	    (tuples_count * sizeof(float)) + 8096; // We leave some headroom in case of negative compression
+	size_t uncompressed_size = tuples_count * sizeof(float);
+
+	float   in[tuples_count];
+	uint8_t out[out_buffer_size];
+	example::fill_random_data(in, tuples_count, 20);
+
+	/*
+	 * Compress
+	 */
+	alp::AlpCompressor compressor = alp::AlpCompressor();
+	printf("Compressing with ALP...\n");
+	compressor.compress_rd(in, tuples_count, out);
+	size_t compressed_size   = compressor.get_size();
+	double compression_ratio = (double)uncompressed_size / compressed_size;
+	printf("Uncompressed size (in bytes): %zu\n", uncompressed_size);
+	printf("Compressed size (in bytes): %zu\n", compressed_size);
+	printf("Compression Ratio: %f\n\n", compression_ratio);
+
+	/*
+	 * Decompress
+	 */
+	size_t decompressed_buffer_size =
+	    alp::AlpApiUtils::align_value(tuples_count);
+	float decompressed[decompressed_buffer_size];
+	alp::AlpDecompressor decompressor = alp::AlpDecompressor();
+	printf("Decompressing with ALP...\n");
+	decompressor.decompress(out, tuples_count, decompressed);
+
+	/*
+	 * Validity Test
+	 */
+	for (size_t i = 0; i < tuples_count; i++) {
+		assert(in[i] == decompressed[i]);
+	}
+	printf("OK\n");
+
+	return 0;
+}
diff --git a/example/simple_compress.cpp b/example/simple_compress.cpp
new file mode 100644
index 0000000..e680f7f
--- /dev/null
+++ b/example/simple_compress.cpp
@@ -0,0 +1,46 @@
+#include "alp.hpp"
+#include "helper.hpp"
+#include <cassert>
+
+int main() {
+	size_t tuples_count = (1024 * 100);
+	size_t out_buffer_size =
+	    (tuples_count * sizeof(double)) + 8096; // We leave some headroom in case of negative compression
+	size_t uncompressed_size = tuples_count * sizeof(double);
+
+	double  in[tuples_count];
+	uint8_t out[out_buffer_size];
+	example::fill_random_data(in, tuples_count, 2);
+
+	/*
+	 * Compress
+	 */
+	alp::AlpCompressor compressor = alp::AlpCompressor();
+	printf("Compressing with ALP...\n");
+	compressor.compress(in, tuples_count, out);
+	size_t compressed_size   = compressor.get_size();
+	double compression_ratio = (double)uncompressed_size / compressed_size;
+	printf("Uncompressed size (in bytes): %zu\n", uncompressed_size);
+	printf("Compressed size (in bytes): %zu\n", compressed_size);
+	printf("Compression Ratio: %f\n\n", compression_ratio);
+
+	/*
+	 * Decompress
+	 */
+	size_t decompressed_buffer_size =
+	    alp::AlpApiUtils::align_value(tuples_count);
+	double decompressed[decompressed_buffer_size];
+	alp::AlpDecompressor decompressor = alp::AlpDecompressor();
+	printf("Decompressing with ALP...\n");
+	decompressor.decompress(out, tuples_count, decompressed);
+
+	/*
+	 * Validity Test
+	 */
+	for (size_t i = 0; i < tuples_count; i++) {
+		assert(in[i] == decompressed[i]);
+	}
+	printf("OK\n");
+
+	return 0;
+}
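
The examples above keep in, out, and decompressed in variable-length stack arrays; at 1024 * 100 tuples that is roughly 2.5 MB of stack, and VLAs are a compiler extension rather than standard C++. A heap-based variant of simple_compress.cpp, a sketch that assumes the same alp.hpp API the examples use, including the align_value call as written there:

    #include "alp.hpp"
    #include "helper.hpp"
    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
    	size_t tuples_count = 1024 * 100;

    	std::vector<double>  in(tuples_count);
    	std::vector<uint8_t> out((tuples_count * sizeof(double)) + 8096); // same headroom as above
    	example::fill_random_data(in.data(), tuples_count, 2);

    	alp::AlpCompressor compressor;
    	compressor.compress(in.data(), tuples_count, out.data());

    	// Pad the output buffer to ALP's vector granularity, as the examples above do.
    	std::vector<double>  decompressed(alp::AlpApiUtils::align_value(tuples_count));
    	alp::AlpDecompressor decompressor;
    	decompressor.decompress(out.data(), tuples_count, decompressed.data());

    	for (size_t i = 0; i < tuples_count; i++) {
    		assert(in[i] == decompressed[i]);
    	}
    	std::printf("OK\n");
    	return 0;
    }
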
diff --git a/example/simple_compress32.cpp b/example/simple_compress32.cpp
new file mode 100644
index 0000000..140ae1c
--- /dev/null
+++ b/example/simple_compress32.cpp
@@ -0,0 +1,46 @@
+#include "alp.hpp"
+#include "helper.hpp"
+#include <cassert>
+
+int main() {
+	size_t tuples_count = (1024 * 100);
+	size_t out_buffer_size =
+	    (tuples_count * sizeof(float)) + 8096; // We leave some headroom in case of negative compression
+	size_t uncompressed_size = tuples_count * sizeof(float);
+
+	float   in[tuples_count];
+	uint8_t out[out_buffer_size];
+	example::fill_random_data(in, tuples_count, 1);
+
+	/*
+	 * Compress
+	 */
+	alp::AlpCompressor compressor = alp::AlpCompressor();
+	printf("Compressing with ALP...\n");
+	compressor.compress(in, tuples_count, out);
+	size_t compressed_size   = compressor.get_size();
+	double compression_ratio = (double)uncompressed_size / compressed_size;
+	printf("Uncompressed size (in bytes): %zu\n", uncompressed_size);
+	printf("Compressed size (in bytes): %zu\n", compressed_size);
+	printf("Compression Ratio: %f\n\n", compression_ratio);
+
+	/*
+	 * Decompress
+	 */
+	size_t decompressed_buffer_size =
+	    alp::AlpApiUtils::align_value(tuples_count);
+	float decompressed[decompressed_buffer_size];
+	alp::AlpDecompressor decompressor = alp::AlpDecompressor();
+	printf("Decompressing with ALP...\n");
+	decompressor.decompress(out, tuples_count, decompressed);
+
+	/*
+	 * Validity Test
+	 */
+	for (size_t i = 0; i < tuples_count; i++) {
+		assert(in[i] == decompressed[i]);
+	}
+	printf("OK\n");
+
+	return 0;
+}
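
All four examples size the decompression buffer with alp::AlpApiUtils::align_value rather than tuples_count directly: decoding writes whole ALP vectors (1024 values in this patch's kernels), so the output buffer must be padded up to that granularity even when tuples_count is not a multiple of it. align_value's exact definition is not part of this hunk; presumably it behaves like this hypothetical helper:

    #include <cstddef>

    // Hypothetical stand-in for alp::AlpApiUtils::align_value: round n up to the
    // 1024-value vector size the generated kernels operate on.
    constexpr std::size_t kAlpVectorSize = 1024;

    constexpr std::size_t align_to_vector(std::size_t n) {
    	return ((n + kAlpVectorSize - 1) / kAlpVectorSize) * kAlpVectorSize;
    }

    static_assert(align_to_vector(1024 * 100) == 1024 * 100, "already a multiple of 1024");
    static_assert(align_to_vector(1000) == 1024, "padded up to one full vector");
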
diff --git a/generated/CMakeLists.txt b/generated/CMakeLists.txt
new file mode 100644
index 0000000..7dc2e8a
--- /dev/null
+++ b/generated/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Describe property
+define_property(GLOBAL PROPERTY ALP_ALL_BENCHMARK_TARGETS
+        BRIEF_DOCS "Global list of elements"
+        FULL_DOCS "Global list of elements")
+# Initialize property
+set_property(GLOBAL PROPERTY ALP_ALL_BENCHMARK_TARGETS "")
+
+# Macro to append values to the list
+macro(add_alp_benchmark var)
+    set_property(GLOBAL APPEND PROPERTY ALP_ALL_BENCHMARK_TARGETS "${var}")
+endmacro(add_alp_benchmark)
+#-----------------------------------------------------------------------------------------------------------------------
+#-----------------------------------------------------------------------------------------------------------------------
+# Always add fallback implementations; they are not important for benchmarking.
+add_compile_definitions(ALP_FALLBACK)
+add_subdirectory(fallback)
+#-----------------------------------------------------------------------------------------------------------------------
+if (${CMAKE_SYSTEM_NAME} MATCHES "Emscripten")
+    set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    add_subdirectory(wasm)
+else ()
+    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+        add_subdirectory(arm64v8)
+        add_compile_definitions(ALP_ARM64V8)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
+#        add_subdirectory(x86_64)
+        add_compile_definitions(ALP_X86_64)
+    else ()
+    endif ()
+endif ()
+#-----------------------------------------------------------------------------------------------------------------------
+add_library(alp_generated SHARED
+        ${ALP_GENERATED_OBJECT_FILES}
+        alp_generated.cpp)
+#-----------------------------------------------------------------------------------------------------------------------
+cmake_print_properties(
+        TARGETS alp_generated
+        PROPERTIES COMPILE_DEFINITIONS
+        PROPERTIES COMPILE_OPTIONS
+        PROPERTIES COMPILE_FEATURES)
+#-----------------------------------------------------------------------------------------------------------------------
+get_property(BENCHMARK_LIST GLOBAL PROPERTY ALP_ALL_BENCHMARK_TARGETS)
+message(STATUS "BENCHMARK LIST: ${BENCHMARK_LIST}")
+
+foreach (i IN LISTS BENCHMARK_LIST)
+    list(APPEND add_custom_target_args COMMAND $<TARGET_FILE:${i}>)
+endforeach ()
+
+add_custom_target(alp_benchmark_all
+        ${add_custom_target_args}
+)
\ No newline at end of file
diff --git a/generated/alp_generated.cpp b/generated/alp_generated.cpp
new file mode 100644
index 0000000..e69de29
diff --git a/generated/arm64v8/CMakeLists.txt b/generated/arm64v8/CMakeLists.txt
new file mode 100644
index 0000000..ced5afe
--- /dev/null
+++ b/generated/arm64v8/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_subdirectory(neon_intrinsic_uf1)
+
+add_library(generated_arm64v8
+        OBJECT
+        arm64v8.cpp)
+
+SET(ALP_GENERATED_OBJECT_FILES
+    ${ALP_GENERATED_OBJECT_FILES} $<TARGET_OBJECTS:generated_arm64v8> PARENT_SCOPE)
\ No newline at end of file
diff --git a/generated/arm64v8/arm64v8.cpp b/generated/arm64v8/arm64v8.cpp
new file mode 100644
index 0000000..e69de29
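
Further down, each packed bit width gets its own generated kernel, named falp_{bw}bw_64ow_128crw_1uf (the suffix presumably encodes the bit width, 64-bit output words, 128-bit NEON registers, and an unroll factor of 1). The generated::falp::arm64v8::neon::falp entry point that the benchmark calls is not visible in this hunk; a hypothetical dispatcher over the per-bit-width kernels would look like:

    #include <cstdint>

    // Kernel signatures as they appear in the generated source further down.
    void falp_0bw_64ow_128crw_1uf(const uint64_t*, double*, const uint64_t*, uint8_t, uint8_t);
    void falp_1bw_64ow_128crw_1uf(const uint64_t*, double*, const uint64_t*, uint8_t, uint8_t);
    void falp_2bw_64ow_128crw_1uf(const uint64_t*, double*, const uint64_t*, uint8_t, uint8_t);

    // Hypothetical sketch: the real generated entry point presumably switches on
    // the bit width and forwards to the matching unrolled kernel.
    void falp(const uint64_t* in, double* out, uint8_t bw, const uint64_t* base, uint8_t fac, uint8_t exp) {
    	switch (bw) {
    	case 0: falp_0bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
    	case 1: falp_1bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
    	case 2: falp_2bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
    	// ... one case per bit width, up to the full 64 bits ...
    	}
    }
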
diff --git a/generated/arm64v8/neon_intrinsic_uf1/CMakeLists.txt b/generated/arm64v8/neon_intrinsic_uf1/CMakeLists.txt
new file mode 100644
index 0000000..2b9ff8a
--- /dev/null
+++ b/generated/arm64v8/neon_intrinsic_uf1/CMakeLists.txt
@@ -0,0 +1,50 @@
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake)
+else()
+endif()
+set(FLS_GENERATED_OBJECT_FILES
+    ${FLS_GENERATED_OBJECT_FILES} PARENT_SCOPE)
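
The benchmark file that follows times two decode paths per dataset: the fused FALP kernel (bit-unpacking and ALP decoding in a single pass) and a scalar unffor followed by AlpDecode, each finishing with exception patching. Stripped of NEON, the arithmetic is roughly the following sketch; fact10/frac10 are stand-ins for the patch's alp::FACT_ARR and alp::Constants::FRAC_ARR lookup tables, and the patching loop is inferred from the patch_exceptions call signature:

    #include <cstddef>
    #include <cstdint>

    // Stand-ins for alp::FACT_ARR[fac] (10^fac) and alp::Constants::FRAC_ARR[exp] (10^-exp).
    static int64_t fact10(uint8_t f) { int64_t v = 1; while (f--) v *= 10; return v; }
    static double  frac10(uint8_t e) { double d = 1.0; while (e--) d /= 10.0; return d; }

    // One 1024-value vector: what the NEON kernels below compute lane by lane.
    void falp_scalar(const int64_t* digits, int64_t base, uint8_t fac, uint8_t exp, double* out) {
    	for (size_t i = 0; i < 1024; ++i) {
    		out[i] = double((digits[i] + base) * fact10(fac)) * frac10(exp);
    	}
    }

    // Exceptions: values ALP could not encode exactly are stored verbatim and
    // written back over the decoded output.
    void patch_exceptions_scalar(double* out, const double* exc, const uint16_t* pos, const uint16_t* exc_count) {
    	for (uint16_t k = 0; k < exc_count[0]; ++k) {
    		out[pos[k]] = exc[k];
    	}
    }
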
diff --git a/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp b/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp
new file mode 100644
index 0000000..303d3cb
--- /dev/null
+++ b/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp
@@ -0,0 +1,130 @@
+#include "arm64v8_neon_intrinsic_1024_uf1_falp_bench.hpp"
+#include "alp/alp.hpp"
+#include "datasets.hpp"
+#include "alp/ffor.hpp"
+#include "alp/unffor.hpp"
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::Dataset& dataset, int64_t* ffor_arr, uint8_t bw, int64_t* base_arr, uint8_t factor, uint8_t exponent, double* dec_dbl_arr, double* exc_arr, uint16_t* pos_arr, uint16_t* exc_c_arr)
+{
+	int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+	uint64_t iterations = 3000000;
+#else
+	uint64_t iterations = 1;
+#endif
+
+	std::string benchmark_name = dataset.name + "_fused";
+
+	uint64_t cycles = benchmark::cycleclock::Now();
+	for (uint64_t i = 0; i < iterations; ++i) {
+		generated::falp::arm64v8::neon::falp(reinterpret_cast<const uint64_t*>(ffor_arr),
+		                                     dec_dbl_arr,
+		                                     bw,
+		                                     reinterpret_cast<const uint64_t*>(base_arr),
+		                                     factor,
+		                                     exponent);
+		alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+	}
+
+	cycles = benchmark::cycleclock::Now() - cycles;
+
+	return benchmark::BenchmarkReporter::Run(
+	    benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
+}
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::Dataset& dataset, int64_t* ffor_arr, int64_t* unffor_arr, uint8_t bw, int64_t* base_arr, uint8_t factor, uint8_t exponent, double* dec_dbl_arr, double* exc_arr, uint16_t* pos_arr, uint16_t* exc_c_arr)
+{
+	int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+	uint64_t iterations = 3000000;
+#else
+	uint64_t iterations = 1;
+#endif
+
+	std::string benchmark_name = dataset.name + "";
+
+	uint64_t cycles = benchmark::cycleclock::Now();
+	for (uint64_t i = 0; i < iterations; ++i) {
+		alp::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
+		alp::AlpDecode(reinterpret_cast<uint64_t*>(unffor_arr), factor, exponent, dec_dbl_arr);
+		alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+	}
+
+	cycles = benchmark::cycleclock::Now() - cycles;
+
+	return benchmark::BenchmarkReporter::Run(
+	    benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
+}
+void benchmark_all(benchmark::Benchmark& benchmark)
+{
+	double*   dbl_arr;
+	double*   exc_arr;
+	uint16_t* pos_arr;
+	uint16_t* exc_c_arr;
+	int64_t*  ffor_arr;
+	int64_t*  unffor_arr;
+
+	int64_t* base_arr;
+	int64_t* dig_arr;
+	double*  dec_dbl_arr;
+
+	uint8_t bw;
+	uint8_t factor;
+	uint8_t exponent;
+
+	alp::state stt;
+
+	dbl_arr     = new (std::align_val_t {64}) double[1024];
+	exc_arr     = new (std::align_val_t {64}) double[1024];
+	pos_arr     = new (std::align_val_t {64}) uint16_t[1024];
+	dig_arr     = new (std::align_val_t {64}) int64_t[1024];
+	dec_dbl_arr = new (std::align_val_t {64}) double[1024];
+	exc_c_arr   = new (std::align_val_t {64}) uint16_t[1024];
+	ffor_arr    = new (std::align_val_t {64}) int64_t[1024];
+	unffor_arr  = new (std::align_val_t {64}) int64_t[1024];
+	base_arr    = new (std::align_val_t {64}) int64_t[1024];
+
+	for (auto& dataset : alp_bench::datasets) {
+		std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+
+		// check to see that the file was opened correctly:
+		if (!ifile.is_open()) {
+			exit(1); // exit or do additional error checking
+		}
+
+		double num = 0.0;
+		// keep storing values from the text file so long as data exists:
+		size_t c {0};
+		while (ifile >> num) {
+			dbl_arr[c] = num;
+			c += 1;
+		}
+
+		factor   = dataset.factor;
+		exponent = dataset.exponent;
+
+		alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+		alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+		alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+
+		benchmark.Run(bench_alp_fused_decode(
+		    dataset, unffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));
+
+		benchmark.Run(bench_alp_decode(
+		    dataset, ffor_arr, unffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));
+
+		ifile.close();
+	}
+}
+int main()
+{
+	benchmark::Benchmark benchmark =
+	    benchmark::create("arm64v8_neon_intrinsic_1024_uf1_falp")
+	        .save()
+	        .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile())
+	        .print()
+	        .add_extra_info(benchmark::CmakeInfo::getCmakeInfo());
+	benchmark_all(benchmark);
+}
diff --git a/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_src.cpp b/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_src.cpp
new file mode 100644
index 0000000..8802c84
--- /dev/null
+++ b/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_src.cpp
@@ -0,0 +1,29670 @@
+#include "alp/falp.hpp"
+#include "alp/constants.hpp"
+#include "fastlanes/macros.hpp"
+#include <arm_neon.h>
+namespace generated
+{
+	namespace falp::arm64v8
+	{
+		namespace neon
+		{
+			static void falp_0bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+			{
+				[[maybe_unused]] auto        out = (a_out_p);
+				[[maybe_unused]] const auto  in = (a_in_p);
+				[[maybe_unused]] uint64x2_t  register_0;
+				[[maybe_unused]] uint64x2_t  tmp_0;
+				[[maybe_unused]] int64x2_t   base_0 = vmovq_n_u64(*(a_base_p));
+				[[maybe_unused]] int64x2_t   factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+				[[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+				[[maybe_unused]] float64x2_t tmp_dbl;
+				[[maybe_unused]] int64x2_t   tmp_int;
+				for (int i = 0; i < 8; ++i)
+				{
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), base_0);
+					vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), base_0);
+					vst1q_f64(out + (i
* 2) + (0 * 8 * 2) + (16 * 11), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), base_0); + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), base_0); + } + } + static void falp_1bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] 
uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 1) 
- 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_2bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl 
= vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_3bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + 
[[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); 
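+ // Note: vcvtq_f64_s64 reads the lanes as signed 64-bit integers; mixing the
+ // uint64x2_t codes with the int64x2_t base and factor relies on
+ // two's-complement wrap-around (strict ACLE would first reinterpret via
+ // vreinterpretq_s64_u64).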
+ tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + 
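// When a 3-bit code straddles two packed words (as above), its low bits are
+ // shifted down out of the exhausted word, its high bits are masked out of
+ // the freshly loaded word, shifted up, and OR-ed in before decoding. +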
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; 
+ tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_4bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 4) - 1)); + 
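// Because 4 divides 64 exactly, this kernel never needs a boundary stitch:
+ // each packed word yields sixteen whole codes and the next word is simply
+ // reloaded. +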
tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), 
vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_5bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = 
vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 5) - 1)); + 
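// 5 does not divide 64, so codes periodically straddle a word boundary; the
+ // fragment split rotates through 4+1, 3+2, 2+3 and 1+4 bits until 320 bits
+ // (five whole words, 64 codes) restore alignment. +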
tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
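// Store layout: outer-loop iteration i writes the double pair at
+ // out[16*j + 2*i] for every code slot j in 0..63, so one call fills the
+ // complete 1024-double output block (8 iterations x 2 lanes x 64 slots). +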
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_6bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out 
+ (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), 
vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + 
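// For 6-bit codes the pattern repeats every 32 values (32 * 6 = 192 bits,
+ // exactly three packed words): two boundary stitches, then a clean reload
+ // like the one just above once the bit offset returns to zero. +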
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 
2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), 
vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_7bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_8bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + 
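// decode constants above are broadcast once per call: the per-vector base, the FACT_ARR[fac] integer factor, and the FRAC_ARR[exp] double scale + 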
[[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 
8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + 
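// 8-bit fields are byte-aligned within each 64-bit word, so this kernel needs no cross-word vshlq/vorrq combine, unlike the 6/7/9/10-bit variants + 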
tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_9bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), 
vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + 
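// 9-bit fields straddle a word boundary every few values; the vshlq/vorrq steps above stitch the split low and high bits back together before decoding + 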
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_10bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out 
+ (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
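// integer digits complete for this lane pair; convert both lanes to double and apply the 10^-exp scale + 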
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + 
tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 10) - 1)); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + 
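// annotation (editor's note, not part of the generated patch): every falp_<bw>bw_64ow_128crw_1uf kernel in this family follows the same template. + // Per iteration it loads packed 64-bit words with vld1q_u64, isolates each <bw>-bit slot via vshrq_n_u64/vandq_u64, and stitches slots that + // straddle a word boundary with vshlq_n_u64 and vorrq_u64; the ALP decode is fused directly behind the unpack: add the frame-of-reference base, + // multiply by alp::FACT_ARR[fac], convert to double with vcvtq_f64_s64, and scale by alp::Constants::FRAC_ARR[exp]. A scalar sketch of what one + // slot computes, with illustrative names word, shift, bw and base (not identifiers from this patch): + //   int64_t digits = (int64_t)((word >> shift) & ((1ULL << bw) - 1)) + base; + //   double  value  = (double)(digits * (int64_t)alp::FACT_ARR[fac]) * alp::Constants::FRAC_ARR[exp]; + 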
static void falp_11bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = 
vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; 
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_12bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t 
tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), 
vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_13bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 
54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), 
vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), 
vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 
2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_14bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl 
= vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), 
vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_15bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; 
+ tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_16bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t 
frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + 
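+ // 16-bit case: 64 is an exact multiple of 16, so each 64-bit input word holds exactly
+ // four values and none straddles a word boundary; unlike the odd bit widths, this
+ // kernel needs no vorrq_u64/vshlq_n_u64 carry step between consecutive loads.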
register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL 
<< 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_17bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_18bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
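+ // When a packed value straddles two 64-bit input words, the kernel stitches it
+ // together: the low bits come from the tail of the current word (vshrq_n_u64 +
+ // mask) and the high bits from the freshly loaded next word, shifted up and
+ // OR-ed in (vshlq_n_u64 + vorrq_u64). The two partial widths always sum to the
+ // kernel's bit width, e.g. 14 + 4 = 18 in the split just above.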
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_19bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + 
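+ // The preamble broadcasts the per-vector ALP constants into both lanes: base_0 is
+ // the frame-of-reference base, factor the integer scaling term, frac10 the double
+ // fraction. tmp_int is declared but never used in these variants, hence the
+ // [[maybe_unused]] markers; the unsigned-to-signed vector assignments appear to
+ // rely on the compiler's lax vector conversions.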
[[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
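+ // Output indexing: each store writes lanes (i*2, i*2+1) of output group k at
+ // out + i*2 + 16*k, so the 1024 results land in a 16-value-strided interleaved
+ // layout rather than sequentially (presumably the transposed vector layout used
+ // throughout these generated kernels).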
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; 
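+ // For reference, a scalar sketch of the per-lane computation above (illustrative
+ // only; an assumption read off the vector sequence, not a drop-in replacement):
+ //   uint64_t digits  = (word >> shift) & ((1ULL << 19) - 1);
+ //   int64_t  encoded = (int64_t)(digits + base) * alp::FACT_ARR[fac];
+ //   double   value   = (double)encoded * alp::Constants::FRAC_ARR[exp];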
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_20bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + 
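+ // In the 20-bit kernel the pattern repeats every 16 outputs: 16 values x 20 bits
+ // fill exactly five 64-bit words, so a fresh masked load with no leading shift
+ // appears at stores 0, 16, 32 (and, by the same arithmetic, 48). Wider bit widths
+ // consume proportionally more input: each loop iteration here issues 20 two-lane
+ // loads (320 u64 across the vector) versus 16 in the 16-bit kernel.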
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL 
<< 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), 
vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_21bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 
2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), 
vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) 
+ 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 7) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; 
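+ // Every unpacked value in this kernel follows the same fused sequence: isolate the
+ // next 21-bit code with a shift+mask (codes that straddle a 64-bit word boundary are
+ // stitched together by OR-ing in the shifted low bits of the freshly loaded word),
+ // then add the frame-of-reference base, multiply by alp::FACT_ARR[fac], convert to
+ // double, and scale by alp::Constants::FRAC_ARR[exp] before the 2-lane store. Each
+ // of the 8 loop iterations issues 64 two-lane vst1q_f64 stores (128 doubles), so one
+ // call decodes a full 1024-value ALP vector. Note that the unsigned/signed vector
+ // arithmetic (uint64x2_t combined with int64x2_t via +=, *=) appears to rely on
+ // GCC/Clang vector extensions and lax vector conversions; strict NEON intrinsics
+ // would require explicit vreinterpretq_* casts.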
+ tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_22bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) 
+ (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), 
vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_23bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + 
(16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), 
vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 
* 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), 
vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_24bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), 
vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = 
vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + 
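+ // Decoding pipeline for each unpacked lane (the same pattern repeats through all
+ // of these generated kernels): a shift/mask isolates the packed value, base_0
+ // applies the frame-of-reference offset, factor is alp::FACT_ARR[fac]
+ // (presumably 10^fac), and frac10 is alp::Constants::FRAC_ARR[exp] (presumably
+ // 10^-exp), reversing ALP's decimal encoding back into a double.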
tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + 
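+ // The rebased and scaled integer lanes are converted to doubles with
+ // vcvtq_f64_s64 below. The mixing of uint64x2_t and int64x2_t operands in
+ // these kernels appears to rely on the compiler accepting lax vector
+ // conversions (an assumption on the build flags, not stated in this patch).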
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_25bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 
* 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + 
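+ // A value that straddles two 64-bit input words is stitched together here:
+ // the low bits retained from the previous word are OR-ed (vorrq_u64) with the
+ // high bits of the freshly loaded word, shifted into position (vshlq_n_u64).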
register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 
= vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 
= vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_26bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,6), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 26) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_27bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = 
vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); 
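+ // Final scaling step: frac10 is FRAC_ARR[exp], presumably 1.0 / 10^exp,
+ // which undoes the decimal scaling applied when the block was encoded.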
+ tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 27) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); 
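+ // Each packed value runs through the same fused pipeline. A scalar sketch of
+ // one decoded value (here `unpacked` is a hypothetical name for the masked
+ // bits; FACT_ARR[f] and FRAC_ARR[e] are presumably 10^f and 1.0 / 10^e, as
+ // in the ALP scheme):
+ //   int64_t digits = ((int64_t)unpacked + (int64_t)*a_base_p) * alp::FACT_ARR[fac];
+ //   double  value  = (double)digits * alp::Constants::FRAC_ARR[exp];
+ // Values straddling a 64-bit word are assembled in two parts: the high bits of
+ // the current word are shifted down, the next word is loaded, and its low bits
+ // are masked, shifted up, and OR-ed in before the add/multiply steps.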
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + 
tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_28bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + 
tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + 
(i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + 
tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_29bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + 
tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 29) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); 
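+ // Bookkeeping for this 29-bit kernel: each loop iteration issues 64 stores of
+ // two doubles (128 values) against 29 two-word loads, consuming 58 * 64 =
+ // 3712 bits = 128 * 29; hence the load offsets step 0, 16, ..., 448 across
+ // the loop body, and only these constants differ between bit-width variants.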
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + 
tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_30bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 
* 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl 
*= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + 
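+ /* Word-straddle case: the 30-bit value starting at bit 50 has only its low
+  * 14 bits in the current word, so the next load fetches the following word
+  * and OR-merges its low 16 bits on top. Scalar equivalent (w0/w1 are the two
+  * packed words, names hypothetical):
+  *
+  *     uint64_t lo = (w0 >> 50) & ((1ULL << 14) - 1); // bottom 14 bits
+  *     uint64_t hi = (w1 & ((1ULL << 16) - 1)) << 14; // top 16 bits
+  *     uint64_t v  = hi | lo;                         // 14 + 16 = 30 bits
+  *
+  * Every vshlq_n_u64/vorrq_u64 pair in these kernels is this pattern; the two
+  * split widths always sum to the kernel's bit width. */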
register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 
2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 
8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_31bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + 
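+ /* Addressing, as reconstructed from the index arithmetic: one call decodes
+  * 1024 doubles as 8 NEON lane pairs (i = 0..7). Packed word w of pair i is
+  * read from in[16*w + 2*i]; decoded value v is stored to out[16*v + 2*i].
+  * Each pair consumes exactly BW input words (64 values * BW bits), which is
+  * why this 31-bit kernel loads offsets 0 through 480 in steps of 16. */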
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl 
= vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_32bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + 
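+ /* Because 32 divides 64 evenly, no value straddles a word boundary in this
+  * kernel: every extraction is either the low half (no shift) or the high
+  * half (shift by 32) of an input word, so the vorrq_u64 recombination used
+  * by the odd-width kernels never appears here. */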
tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 
= vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + 
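+ /* A caller would presumably pick one of these kernels from the run's bit
+  * width; a minimal hypothetical dispatcher over the kernels in this hunk:
+  *
+  *     switch (bw) {
+  *     case 29: falp_29bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
+  *     case 30: falp_30bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
+  *     case 31: falp_31bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
+  *     case 32: falp_32bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
+  *     case 33: falp_33bw_64ow_128crw_1uf(in, out, base, fac, exp); break;
+  *     // ... one case per generated width
+  *     } */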
register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; 
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_33bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) 
- 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 33) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,3), 
tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,24), 
tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_34bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 34) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 34) - 1)); 
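+ // In this 34-bit kernel values no longer align to 64-bit words, so two cases
+ // alternate: a value wholly contained in the current word is extracted with a
+ // single shift-and-mask (as in the statement just above), while a value that
+ // straddles a word boundary is stitched together by OR-ing the low bits of
+ // the freshly loaded word, shifted into position, onto the high bits carried
+ // over from the previous word (the vshrq_n_u64 / vshlq_n_u64 / vorrq_u64
+ // sequences). The base/factor/frac10 decode that follows is identical in both
+ // cases.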
+ tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,4), 
tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_35bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double 
*__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_36bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, 
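+ // falp_36bw_64ow_128crw_1uf: fused bit-unpacking + ALP decoding for 36-bit codes.
+ // Reading the suffix as <bit width>bw_<output word width>ow_<compressed register
+ // width>crw_<unroll factor>uf is an inference from the kernel family, not something
+ // this generated file states. Each NEON block in the body is the vector form of the
+ // following scalar sketch (illustrative only: bits, digits, w, o, and k are
+ // hypothetical names, and the word-straddling case is elided):
+ //   uint64_t bits   = (in[w] >> o) & ((1ULL << 36) - 1);      // unpack 36 bits
+ //   int64_t  digits = (int64_t)bits + (int64_t)*a_base_p;     // undo frame-of-reference
+ //   double   value  = (double)(digits * alp::FACT_ARR[fac]);  // apply ALP factor
+ //   out[k]          = value * alp::Constants::FRAC_ARR[exp];  // apply ALP exponent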
uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,28), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 36) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,20), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 36) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,12), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 36) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,4), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,32), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 36) -
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + 
(i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + 
(i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 
= vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_37bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = 
vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,27), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 37) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,17), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 37) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,7), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,34), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 37) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,24), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 37) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14)
- 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); 
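+ // When a 37-bit code straddles a 64-bit word boundary, the combine just above takes
+ // the low bits from the exhausted word with vshrq_n_u64 and the high bits from the
+ // freshly loaded word, masked and shifted left by the number of bits already
+ // gathered (here 28 low bits | 9 high bits << 28 = 37 bits); roughly, as a scalar
+ // sketch with hypothetical names: bits = ((w1 & 0x1FF) << 28) | (w0 >> 36).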
+ tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); 
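+ // Store addressing: vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * k), ...) writes
+ // value k of the two lanes handled in iteration i, so out[16 * k .. 16 * k + 15]
+ // ends up holding value k of all 16 interleaved 64-bit lanes once the i-loop
+ // finishes -- the transposed FastLanes-style layout this kernel family appears to
+ // assume (inferred from the index arithmetic; 64 values x 16 lanes = 1024 doubles).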
+ tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); 
+ tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_38bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, 
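+ // falp_38bw_64ow_128crw_1uf: the 38-bit member of the same generated family. The
+ // preamble keeps the family's idiom of initializing int64x2_t base_0/factor from the
+ // unsigned vmovq_n_u64 and of applying += and *= directly to NEON vector types; both
+ // rely on GCC/Clang vector extensions and lax vector conversions (assumed to be the
+ // intended toolchains). AArch64 NEON has no 64-bit integer vector multiply, so the
+ // compiler is expected to scalarize tmp_0 *= factor.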
uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,26), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 38) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,14), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 38) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,2), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,28), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 38) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,16), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 38) -
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); 
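+ // Load addressing: in + (0 * 8 * 2) + (i * 2) + 16 * w steps through the packed
+ // words at a stride of 16 u64s per word index w; the constant-folded (0 * 8 * 2)
+ // term looks like a generator artifact kept for symmetry with multi-block variants
+ // (an inference -- the generator itself is not part of this hunk).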
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + 
tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,8), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_39bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 
1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), 
vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
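When a packed value crosses a 64-bit word boundary, the kernels stitch it together from two loads: the high end of the current word (vshrq_n_u64 plus a mask) and the low end of the next word (mask, vshlq_n_u64, then vorrq_u64). A scalar sketch of that split extraction; extract_straddling is a hypothetical helper, and the generated code above instead inlines every shift and mask as a compile-time constant:

    // Reassemble a bw-bit value whose low part sits at bit 'start' of 'lo_word'
    // and whose remaining high bits sit at the bottom of 'hi_word'.
    // Assumes the value really straddles, i.e. start + bw > 64.
    static inline uint64_t extract_straddling(uint64_t lo_word, uint64_t hi_word,
                                              unsigned start, unsigned bw) {
        unsigned lo_bits = 64 - start;           // bits taken from lo_word
        uint64_t lo = lo_word >> start;          // vshrq_n_u64 (the extra mask
                                                 // the kernels apply here is a
                                                 // no-op: only lo_bits remain)
        uint64_t hi = hi_word & ((1ULL << (bw - lo_bits)) - 1); // vandq_u64
        return (hi << lo_bits) | lo;             // vshlq_n_u64 + vorrq_u64
    }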
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_40bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) 
,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
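The index arithmetic fixes the block shape: the loop runs i = 0..7, each vld1q_u64/vst1q_f64 touches two 64-bit lanes, and each iteration issues 64 stores (16 * 0 through 16 * 63), so one call decodes 8 x 2 x 64 = 1024 doubles, one ALP vector. For bit width bw it reads 16*bw input words, i.e. exactly 1024*bw bits. A scalar sketch of the output addressing, matching the `out + (i * 2) + (16 * j)` stores above (illustrative only):

    #include <cstddef>

    // Where the kernels place value j (0..63) of lane pair i (0..7), lane 0/1.
    static inline size_t falp_out_index(size_t i, size_t j, size_t lane) {
        return i * 2 + 16 * j + lane; // 8 * 2 * 64 = 1024 positions in total
    }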
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 
* 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_41bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl 
= vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 
176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 
2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), 
vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + 
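+ // Editorial note: every unrolled step in these generated kernels follows the same
+ // FALP decode pipeline: reassemble one packed integer (mask the bits remaining in
+ // the current 64-bit word, OR in the spill-over bits from the next word), add the
+ // frame-of-reference base, multiply by 10^fac (alp::FACT_ARR), convert the 64-bit
+ // lanes to double, and scale by 10^-exp (alp::Constants::FRAC_ARR). An equivalent
+ // scalar sketch of one 41-bit step, with hypothetical names, assuming the value
+ // starts at bit `start` of word `w0` and spills into word `w1`:
+ //   uint64_t lo = (w0 >> start) & ((1ULL << (64 - start)) - 1);      // bits left in w0
+ //   uint64_t hi = (w1 & ((1ULL << (41 - (64 - start))) - 1)) << (64 - start);
+ //   int64_t  v  = ((int64_t)(lo | hi) + base) * factor;              // FOR base, * 10^fac
+ //   double   d  = (double)v * frac10;                                // * 10^-exp
+ // The generated code performs the add/multiply on uint64x2_t/int64x2_t operands
+ // interchangeably, which appears to rely on the compiler's lax vector conversions;
+ // the arithmetic is equivalent under two's complement.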
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 
2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_42bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + 
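+ // Editorial note: 42-bit packing realigns to a word boundary every 32 values
+ // (32 * 42 = 1344 bits = 21 machine words), which is why the load a few steps
+ // above (offset 336 = 21 words x 16 interleaved lanes) restarts with a plain
+ // mask and no OR, and values 32..63 repeat the shift pattern of values 0..31.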
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 
2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 26) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_43bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + 
(0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 4) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + 
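+ // Editorial note: 43 is coprime with 64, so this kernel never realigns mid-block:
+ // all 64 values in the unrolled body use distinct shift/mask pairs (43 * 64 = 2752
+ // bits = exactly 43 words per lane), and a value straddles two consecutive words
+ // whenever fewer than 43 bits remain in the current word.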
register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + 
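+ // Editorial note: the store addressing out + (i * 2) + (16 * k) interleaves 16
+ // lanes; iteration i of the loop decodes values k = 0..63 for one pair of lanes,
+ // so a single call emits 8 * 2 * 64 = 1024 doubles, one full ALP vector. A
+ // hypothetical scalar sketch of the equivalent layout, for illustration only:
+ //   for (int lane = 0; lane < 16; ++lane)
+ //     for (int k = 0; k < 64; ++k)
+ //       out[16 * k + lane] = decode(packed, lane, k);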
tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL 
<< 43) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_44bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
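/* Each unpacked lane pair goes through the same decode pipeline seen in the
   step just above: mask the packed bits, add the frame-of-reference base,
   multiply by alp::FACT_ARR[fac], convert to double, then scale by
   alp::Constants::FRAC_ARR[exp]. A scalar sketch of one 44-bit value
   (`word`, `shift`, `k` are hypothetical names, not part of this kernel):
     int64_t v = (int64_t)((word >> shift) & ((1ULL << 44) - 1));
     out[k]    = (double)((v + base) * alp::FACT_ARR[fac])
                 * alp::Constants::FRAC_ARR[exp];
*/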
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_45bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + 
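/* A 45-bit value that straddles a 64-bit word boundary is stitched in two
   steps, as right here: the 27 low bits were just taken from the previous
   word (shift 37, mask (1ULL << 27) - 1), and the load below supplies the
   18 high bits, masked from the fresh word and shifted left by 27 before
   being OR'd in (27 + 18 = 45). Scalar sketch with hypothetical names:
     uint64_t lo = prev_word >> 37;                         // top 27 bits
     uint64_t hi = (next_word & ((1ULL << 18) - 1)) << 27;  // low 18 bits
     uint64_t v  = hi | lo;                                 // 45-bit value
*/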
register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) 
,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_46bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = 
vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; 
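/* The widened integer is about to be converted lane-wise to double
   (vcvtq_f64_s64) and scaled. As far as the indexing shows, the layout of
   these generated kernels is: the outer i-loop covers 8 pairs of lanes, and
   each pass emits 64 stores of 2 doubles at out + i*2 + 16*k, i.e. 1024
   doubles per call, while the loads walk the packed input in rows of 16
   uint64_t (here 46 rows for the 46-bit layout). Fusing the bit-unpacking
   with the ALP decode presumably avoids materializing the unpacked
   integers in memory between the two steps. */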
+ tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 6) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), 
vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + 
(i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 36) - 1)) ,10), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,28), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_47bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,17), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,34), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 47) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,4), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,21), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i
* 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 
48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 
2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + 
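// value 46 of each lane straddles a word boundary: its low 14 bits come from bits 50..63 of the current packed word, and its high 33 bits from the next word (14 + 33 = 47)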
register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); 
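+ // offset 8 + width 47 fits inside one 64-bit word, so the next value needs only a single shift-and-mask (no bits patched in from the following word)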
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 47) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,9), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,26), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,43), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 47) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,13), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,30), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_48bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
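+ // fused ALP decode: (packed value + frame-of-reference base) is multiplied by FACT_ARR[fac],
+ // converted to double, then scaled by FRAC_ARR[exp] on the next line; scalar equivalent (illustrative only):
+ //   out[k] = double((int64_t)v + base) * alp::FACT_ARR[fac] * alp::Constants::FRAC_ARR[exp];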
+ tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = 
vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 
2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + register_0 = vld1q_u64(in 
+ (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl 
= vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl);
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl);
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,16), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,32), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_49bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]]
float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i 
* 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
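// value 29 occupies bits 13..61 of the current word (13 + 49 = 62), so it fits without patching; the 49-bit mask also clears bits 62..63, which hold the low bits of value 30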
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,9), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 
49) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,4), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,19), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,34), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ // FALP kernel generated for a 50-bit packed width. One call decodes a
+ // 1024-value FastLanes block: each loop iteration unpacks 64 value slots
+ // for one pair of 64-bit lanes and fuses the ALP decode (add the
+ // frame-of-reference base, multiply by FACT_ARR[fac], convert to double,
+ // scale by FRAC_ARR[exp]). Suffix reading is assumed from the generator
+ // convention: bw = bit width, ow = output word size, crw = compression-run
+ // width, uf = unroll factor.
+ static void falp_50bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,14), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,28), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1))
,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); 
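+ // frac10 broadcasts alp::Constants::FRAC_ARR[exp] (presumably 10^-exp), so
+ // the multiply below undoes ALP's decimal scaling after the integer decode.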
+ tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) 
,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,14), tmp_0); + tmp_0 += base_0; 
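+ // base_0 is the frame-of-reference base broadcast from *a_base_p. The mixed
+ // signed/unsigned NEON arithmetic in these generated kernels appears to rely
+ // on compiler vector extensions (e.g. -flax-vector-conversions).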
+ tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL 
<< 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 
62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ // Generated FALP kernel for a 51-bit packed width; identical structure to
+ // the 50-bit variant above, with the shift/mask schedule regenerated.
+ static void falp_51bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 51) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,13), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,26), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,39), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 51) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,1), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,14), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96);
+ tmp_0
= vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL 
<< 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 
26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 29) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + 
(0 * 8 * 2) + (i * 2) + 720);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,24), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,37), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,50), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 51) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,12), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,25), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,38), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 51) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ // Generated FALP kernel for a 52-bit packed width; same fused
+ // unpack-and-decode pattern as the variants above.
+ static void falp_52bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1));
+ tmp_0 += base_0;
+
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= 
frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 
2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,16), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,28), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,40), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_53bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 53) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,11), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,22), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,33), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
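+ // Same fused unpack/rebase/scale pattern for the 53-bit width; only the
+ // shift amounts and masks change as each lane walks across the packed words.
+ // Lanes that end inside a word need a single shift-and-mask, while straddling
+ // lanes are stitched together from two consecutive words before rebasing.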
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 51) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 38) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out 
+ (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 51) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 22) - 1)) ,31), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,42), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 53) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_54bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,10), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,20), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,30), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,40), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 4) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 
2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), 
vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 
* 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
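// A scalar sketch of what each five-statement group in these kernels computes,
+ // with `packed` standing for the already-aligned input bits and w = 54 here:
+ //   int64_t v = (int64_t)(packed & ((1ULL << w) - 1));    // unpack one w-bit value
+ //   v += (int64_t)*a_base_p;                              // undo the frame-of-reference base
+ //   v *= alp::FACT_ARR[fac];                              // restore the ALP integer scale
+ //   double d = (double)v * alp::Constants::FRAC_ARR[exp]; // final decoded double
+ // The NEON statements perform exactly this, two lanes per 128-bit register.
+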
tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_55bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 55) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,9), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,18), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,27), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,36), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,45), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,54), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 55) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
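+ // When a 55-bit value straddles two 64-bit words it is stitched together: the
+ // low 64 - s bits come from `register_0 >> s` of the previous word, and the
+ // remaining w - (64 - s) bits come from the freshly loaded word, masked and
+ // shifted up by 64 - s before the OR. Per lane, roughly:
+ //   v = (prev >> s) | ((next & ((1ULL << (w - 64 + s)) - 1)) << (64 - s));
+ // Values that still fit entirely in the current word, like the one being
+ // finished here, need no second load.
+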
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 53) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 51) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 
4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 51) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) 
,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 53) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 
* 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_56bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t 
register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + 
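// Since 56 is byte-aligned (7 * 8 bits), the packing pattern repeats every 8
+ // values: values 0, 8, 16, ... each start at bit 0 of a freshly loaded word
+ // (a plain mask, no stitching), and the shift sequence 56/48/40/32/24/16/8
+ // then recurs, which is why the same eight-value, seven-load block repeats
+ // with only the offsets changing.
+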
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + 
(i * 2) + 672); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_57bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + 
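// Addressing in all of these kernels follows the interleaved (FastLanes-style)
+ // layout: one call decodes a block of 1024 doubles spread across 16 lanes, so
+ // word j of lane l sits at in[16 * j + l] and output k of lane l at
+ // out[16 * k + l]. The i-loop walks the 16 lanes two at a time (one 128-bit
+ // NEON register), which is why every offset is (i * 2) plus a multiple of 16.
+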
tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 51) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 55) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,55), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 53) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 
46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 53) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 40) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 55) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + 
(i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 51) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + 
(0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_58bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = 
vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,2), tmp_0); + tmp_0 += base_0; + 
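// integer-domain half of the fused decode: un-FOR with base_0, then scale by FACT_ARR[fac] +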
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,56), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 
= vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; 
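+ // floating-point half: convert the signed 64-bit lanes to f64, then apply frac10 (FRAC_ARR[exp])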
+ tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 912); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_59bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 
= vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 55) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,55), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 58) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 53) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 51) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 57) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), 
vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 57) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,57), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, 
vdupq_n_u64((1ULL << 56) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 51) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 53) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 58) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 55) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = 
vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,34), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,39), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,44), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 912);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,49), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 928);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,54), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 59) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_60bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,4), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
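+ // The unrolled body repeats one FALP step per pair of outputs: mask out the
+ // next 60-bit lanes (OR-ing in any bits carried over from the previous
+ // 64-bit word when a value straddles a word boundary), add the
+ // frame-of-reference base, multiply by FACT_ARR[fac], convert to double,
+ // and scale by FRAC_ARR[exp]. A scalar sketch of one step, for orientation
+ // only (same tables as above, base = *a_base_p):
+ //   uint64_t packed = /* next 60-bit value */;
+ //   int64_t digits = (int64_t)packed + base;   // undo frame-of-reference
+ //   digits *= (int64_t)alp::FACT_ARR[fac];     // undo factor scaling
+ //   double value = (double)digits * alp::Constants::FRAC_ARR[exp];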
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); 
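+ // register_0 now holds the next input word; the vorrq_u64 below prepends
+ // its low bits above the high bits already carried over in tmp_0 from the
+ // previous word.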
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,40), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,44), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 912);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,48), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 928);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,52), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 944);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,56), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl);
+ }
+ }
+ static void falp_61bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] uint64x2_t register_0;
+ [[maybe_unused]] uint64x2_t tmp_0;
+ [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p));
+ [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]);
+ [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]);
+ [[maybe_unused]] float64x2_t tmp_dbl;
+ [[maybe_unused]] int64x2_t tmp_int;
+ for (int i = 0; i < 8; ++i)
+ {
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0);
+ tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 61) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_dbl = vcvtq_f64_s64(tmp_0);
+ tmp_dbl *= frac10;
+ vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1));
+ register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16);
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 58) - 1)) ,3), tmp_0);
+ tmp_0 += base_0;
+ tmp_0
*= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 55) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 51) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 57) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,57), tmp_0); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,60), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 61) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 59) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 53) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); 
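+ // With a 61-bit width the start offset of a value slides back by
+ // 64 - 61 = 3 bits per step, so nearly every value spans two input words;
+ // only when the offset falls to 3 or below does a single mask-and-shift
+ // suffice (the occasional steps without a vld1q_u64/vorrq_u64 pair).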
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 53) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 59) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,59), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 61) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 57) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 51) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = 
vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); 
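+ // Input layout note: word j of lane pair i sits at in[i * 2 + j * 16], so
+ // the packed words of all eight two-lane groups are interleaved in rows of
+ // sixteen uint64_t and the outer loop's eight decodes walk the input row by
+ // row.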
+ tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 912); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 928); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 55) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 944); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,55), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 58) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 960); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 61) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_62bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = 
vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 62) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 58) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + 
tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 
288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + 
(i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 58) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,60), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 62) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 62) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 58) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = 
vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); 
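+ // note (editorial annotation, not in the generated source): every unrolled step repeats the same fused ALP (FALP) sequence: reassemble one 62-bit code from the packed words, add the frame-of-reference base_0, multiply by alp::FACT_ARR[fac], convert to double with vcvtq_f64_s64, and scale by alp::Constants::FRAC_ARR[exp] before the vst1q_f64 store.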
+ tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 912); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 928); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 944); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 58) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 960); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 976); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,60), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 62) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_63bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, 
uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + tmp_0 = vandq_u64(register_0, vdupq_n_u64((1ULL << 63) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 63), vdupq_n_u64((1ULL << 1) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 62) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 62), vdupq_n_u64((1ULL << 2) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 61) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 61), vdupq_n_u64((1ULL << 3) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 60) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 60), vdupq_n_u64((1ULL << 4) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 59) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 59), vdupq_n_u64((1ULL << 5) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 58) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 58), vdupq_n_u64((1ULL << 6) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 57) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 57), vdupq_n_u64((1ULL << 7) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 56) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + 
tmp_0 = vandq_u64(vshrq_n_u64(register_0, 56), vdupq_n_u64((1ULL << 8) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 55) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 55), vdupq_n_u64((1ULL << 9) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 54) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 54), vdupq_n_u64((1ULL << 10) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 53) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 53), vdupq_n_u64((1ULL << 11) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 52) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 52), vdupq_n_u64((1ULL << 12) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 51) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 51), vdupq_n_u64((1ULL << 13) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 50) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 50), vdupq_n_u64((1ULL << 14) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 49) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 49), vdupq_n_u64((1ULL << 15) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 48) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 48), vdupq_n_u64((1ULL << 16) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 47) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), 
tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 47), vdupq_n_u64((1ULL << 17) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 46) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 46), vdupq_n_u64((1ULL << 18) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 45) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 45), vdupq_n_u64((1ULL << 19) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 44) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 44), vdupq_n_u64((1ULL << 20) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 43) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 43), vdupq_n_u64((1ULL << 21) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 42) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 42), vdupq_n_u64((1ULL << 22) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 41) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 41), vdupq_n_u64((1ULL << 23) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 40) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 40), vdupq_n_u64((1ULL << 24) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 39) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 39), vdupq_n_u64((1ULL << 25) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 38) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + 
(16 * 25), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 38), vdupq_n_u64((1ULL << 26) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 37) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 37), vdupq_n_u64((1ULL << 27) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 36) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 36), vdupq_n_u64((1ULL << 28) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 35) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 35), vdupq_n_u64((1ULL << 29) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 34) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 34), vdupq_n_u64((1ULL << 30) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 33) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 33), vdupq_n_u64((1ULL << 31) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 32) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 32), vdupq_n_u64((1ULL << 32) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 31) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 31), vdupq_n_u64((1ULL << 33) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 30) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 30), vdupq_n_u64((1ULL << 34) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 29) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 
8 * 2) + (16 * 34), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 29), vdupq_n_u64((1ULL << 35) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 28) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 28), vdupq_n_u64((1ULL << 36) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 27) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 27), vdupq_n_u64((1ULL << 37) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 26) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 26), vdupq_n_u64((1ULL << 38) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 25) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 25), vdupq_n_u64((1ULL << 39) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 24) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 24), vdupq_n_u64((1ULL << 40) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 640); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 23) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 23), vdupq_n_u64((1ULL << 41) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 22) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 22), vdupq_n_u64((1ULL << 42) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 21) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 21), vdupq_n_u64((1ULL << 43) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 20) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 
2) + (0 * 8 * 2) + (16 * 43), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 20), vdupq_n_u64((1ULL << 44) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 19) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 19), vdupq_n_u64((1ULL << 45) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 18) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 18), vdupq_n_u64((1ULL << 46) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 17) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 17), vdupq_n_u64((1ULL << 47) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 16) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 16), vdupq_n_u64((1ULL << 48) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 15) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 15), vdupq_n_u64((1ULL << 49) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 14) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 14), vdupq_n_u64((1ULL << 50) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 13) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 13), vdupq_n_u64((1ULL << 51) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 12) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 12), vdupq_n_u64((1ULL << 52) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 11) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 11), vdupq_n_u64((1ULL << 53) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 10) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 10), vdupq_n_u64((1ULL << 54) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 9) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 9), vdupq_n_u64((1ULL << 55) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 8) - 1)) ,55), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 8), vdupq_n_u64((1ULL << 56) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 7) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 7), vdupq_n_u64((1ULL << 57) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 912); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 6) - 1)) ,57), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 6), vdupq_n_u64((1ULL << 58) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 928); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 5) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 5), vdupq_n_u64((1ULL << 59) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 944); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 4) - 1)) ,59), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 4), vdupq_n_u64((1ULL << 60) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 960); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 3) - 1)) ,60), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 3), vdupq_n_u64((1ULL << 61) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 976); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 2) - 1)) ,61), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + 
vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 2), vdupq_n_u64((1ULL << 62) - 1)); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 992); + tmp_0 = vorrq_u64(vshlq_n_u64(vandq_u64(register_0, vdupq_n_u64((1ULL << 1) - 1)) ,62), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), tmp_dbl); + tmp_0 = vandq_u64(vshrq_n_u64(register_0, 1), vdupq_n_u64((1ULL << 63) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), tmp_dbl); + } + } + static void falp_64bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] uint64x2_t register_0; + [[maybe_unused]] uint64x2_t tmp_0; + [[maybe_unused]] int64x2_t base_0 = vmovq_n_u64(*(a_base_p)); + [[maybe_unused]] int64x2_t factor = vmovq_n_u64(alp::FACT_ARR[fac]); + [[maybe_unused]] float64x2_t frac10 = vmovq_n_f64(alp::Constants::FRAC_ARR[exp]); + [[maybe_unused]] float64x2_t tmp_dbl; + [[maybe_unused]] int64x2_t tmp_int; + for (int i = 0; i < 8; ++i) + { + // at 64 bits per value each packed word already holds a complete ALP code, so no bit reassembly is needed: add the FOR base, apply the factor, convert to double, and scale. + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 0); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 0), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 16); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 1), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 32); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 2), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 48); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 3), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 64); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 4), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 80); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 5), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 96); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 6), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 112); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 7), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 128); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 8), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 144); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 9), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 160); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 10), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 176); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 11), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 192); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 12), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 208); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 13), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 224); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 14), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 240); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 15), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 256); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 16), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 272); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 17), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 288); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 18), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 304); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 19), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 320); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 20), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 336); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 21), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 352); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 22), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 368); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 23), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 384); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 24), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 400); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 25), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 416); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 26), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 432); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 27), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 448); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 28), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 464); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 29), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 480); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 30), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 496); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 31), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 512); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 32), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 528); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 33), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 544); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 34), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 560); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 35), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 576); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 36), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 592); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 37), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 608); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 38), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 624); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(register_0); + tmp_dbl *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 39), tmp_dbl); + register_0 = vld1q_u64(in + (0 * 8
* 2) + (i * 2) + 640); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 40), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 656); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 41), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 672); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 42), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 688); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 43), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 704); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 44), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 720); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 45), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 736); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 46), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 752); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 47), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 768); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 48), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 784); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 49), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 800); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 50), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 816); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 51), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 832); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 52), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 848); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 53), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 864); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 54), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 880); + register_0 += base_0; + register_0 *= factor; + 
tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 55), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 896); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 56), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 912); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 57), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 928); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 58), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 944); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 59), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 960); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 60), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 976); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 61), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 992); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 62), register_0); + register_0 = vld1q_u64(in + (0 * 8 * 2) + (i * 2) + 1008); + register_0 += base_0; + register_0 *= factor; + tmp_dbl = vcvtq_f64_s64(tmp_0); + register_0 *= frac10; + vst1q_f64(out + (i * 2) + (0 * 8 * 2) + (16 * 63), register_0); + } + } + void falp(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, uint8_t bw, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + switch (bw) + { + case 0: + falp_0bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 1: + falp_1bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 2: + falp_2bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 3: + falp_3bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 4: + falp_4bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 5: + falp_5bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 6: + falp_6bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 7: + falp_7bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 8: + falp_8bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 9: + falp_9bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 10: + falp_10bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 11: + falp_11bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 12: + falp_12bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 13: + falp_13bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 14: + falp_14bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 15: + falp_15bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 16: + 
falp_16bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 17: + falp_17bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 18: + falp_18bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 19: + falp_19bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 20: + falp_20bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 21: + falp_21bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 22: + falp_22bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 23: + falp_23bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 24: + falp_24bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 25: + falp_25bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 26: + falp_26bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 27: + falp_27bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 28: + falp_28bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 29: + falp_29bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 30: + falp_30bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 31: + falp_31bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 32: + falp_32bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 33: + falp_33bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 34: + falp_34bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 35: + falp_35bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 36: + falp_36bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 37: + falp_37bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 38: + falp_38bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 39: + falp_39bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 40: + falp_40bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 41: + falp_41bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 42: + falp_42bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 43: + falp_43bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 44: + falp_44bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 45: + falp_45bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 46: + falp_46bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 47: + falp_47bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 48: + falp_48bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 49: + falp_49bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 50: + falp_50bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 51: + falp_51bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 52: + falp_52bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 53: + falp_53bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 54: + falp_54bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 55: + falp_55bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 56: + falp_56bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 57: + 
falp_57bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        case 58:
+            falp_58bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        case 59:
+            falp_59bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        case 60:
+            falp_60bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        case 61:
+            falp_61bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        case 62:
+            falp_62bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        case 63:
+            falp_63bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        case 64:
+            falp_64bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+            break;
+        }
+    }
+    }
+    }
+}
+;
diff --git a/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp b/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp
new file mode 100644
index 0000000..acf6e46
--- /dev/null
+++ b/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp
@@ -0,0 +1,119 @@
+#include "alp.hpp"
+#include "data.hpp"
+#include "gtest/gtest.h"
+#include <fstream>
+
+class arm64v8_neon_intrinsic_1024_uf1_falp : public ::testing::Test
+{
+  public:
+    double*   dbl_arr;
+    double*   exc_arr;
+    uint16_t* pos_arr;
+    uint16_t* exc_c_arr;
+    int64_t*  ffor_arr;
+    int64_t*  unffor_arr;
+    int64_t*  base_arr;
+    int64_t*  dig_arr;
+    double*   dec_dbl_arr;
+    uint8_t   bw;
+    uint8_t   factor;
+    uint8_t   exponent;
+    double*   smp_arr;
+    void SetUp() override
+    {
+        dbl_arr     = new double[1024];
+        exc_arr     = new double[1024];
+        pos_arr     = new uint16_t[1024];
+        dig_arr     = new int64_t[1024];
+        dec_dbl_arr = new double[1024];
+        exc_c_arr   = new uint16_t[1024];
+        ffor_arr    = new int64_t[1024];
+        unffor_arr  = new int64_t[1024];
+        base_arr    = new int64_t[1024];
+        smp_arr     = new double[1024];
+    }
+    ~arm64v8_neon_intrinsic_1024_uf1_falp() override
+    {
+        delete[] dbl_arr;
+        delete[] exc_arr;
+        delete[] pos_arr;
+        delete[] dig_arr;
+        delete[] dec_dbl_arr;
+        delete[] exc_c_arr;
+        delete[] ffor_arr;
+        delete[] unffor_arr;
+        delete[] base_arr;
+        delete[] smp_arr;
+    }
+};
+TEST_F(arm64v8_neon_intrinsic_1024_uf1_falp, fused)
+{
+    for (auto& dataset : alp_bench::alp_dataset)
+    {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+        ASSERT_EQ(ifile.fail(), false);
+        alp::state stt;
+        if (dataset.suitable_for_cutting) { continue; }
+        if (dataset.name.find("bw") != std::string::npos) { continue; }
+        double num = 0.0;
+        size_t c {0};
+        while (ifile >> num)
+        {
+            dbl_arr[c] = num;
+            c          = c + 1;
+        }
+        // Init
+        alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+        // Encode
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        fastlanes::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+        // Decode
+        generated::falp::arm64v8::neon::falp(reinterpret_cast<uint64_t*>(ffor_arr), dec_dbl_arr, bw, reinterpret_cast<uint64_t*>(base_arr), stt.fac, stt.exp);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+        for (size_t i = 0; i < 1024; ++i)
+        {
+            ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+        }
+        ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+        ASSERT_EQ(dataset.bit_width, bw);
+        ifile.close();
+    }
+}
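Both tests round-trip the same 1024-value block; they differ only in where the FFOR reversal happens. Per value, the fused kernel masks out `bw` bits, adds the block base, and applies ALP's decode multiplies in one step. A minimal scalar sketch of that per-value arithmetic, assuming the decode rule `value = double((digits + base) * 10^fac) * 10^-exp`; the truncated tables and names below are illustrative, not the library's:

    #include <cstdint>

    // Scalar model of one fused FALP lane (illustrative; the generated
    // kernels apply the same arithmetic across NEON vectors).
    static double falp_scalar(uint64_t packed, unsigned bw, uint64_t base, uint8_t fac, uint8_t exp) {
        static const int64_t FACT[4] = {1, 10, 100, 1000};      // 10^fac, table truncated here
        static const double  FRAC[4] = {1.0, 0.1, 0.01, 0.001}; // 10^-exp, table truncated here
        const uint64_t mask   = (bw >= 64) ? ~0ULL : ((1ULL << bw) - 1);
        const uint64_t digits = (packed & mask) + base;                   // FFOR-unpack
        const int64_t  scaled = static_cast<int64_t>(digits) * FACT[fac]; // integer rescale
        return static_cast<double>(scaled) * FRAC[exp];                   // convert, scale down
    }

Values for which this reconstruction is not exact are stored out of band and written back afterwards, which is why both tests call `patch_exceptions` before comparing.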
+
+TEST_F(arm64v8_neon_intrinsic_1024_uf1_falp, unfused)
+{
+    for (auto& dataset : alp_bench::alp_dataset)
+    {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+        ASSERT_EQ(ifile.fail(), false);
+        alp::state stt;
+        if (dataset.suitable_for_cutting) { continue; }
+        if (dataset.name.find("bw") != std::string::npos) { continue; }
+        double num = 0.0;
+        size_t c {0};
+        while (ifile >> num)
+        {
+            dbl_arr[c] = num;
+            c          = c + 1;
+        }
+        // Init
+        alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+        // Encode
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        fastlanes::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+        // Decode
+        fastlanes::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
+        alp::AlpDecode::decode(unffor_arr, stt.fac, stt.exp, dec_dbl_arr);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+        for (size_t i = 0; i < 1024; ++i)
+        {
+            ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+        }
+        ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+        ASSERT_EQ(dataset.bit_width, bw);
+        ifile.close();
+    }
+}
diff --git a/generated/arm64v8/neon_intrinsic_uf1/falp.cmake b/generated/arm64v8/neon_intrinsic_uf1/falp.cmake
new file mode 100644
index 0000000..e9e57ce
--- /dev/null
+++ b/generated/arm64v8/neon_intrinsic_uf1/falp.cmake
@@ -0,0 +1,32 @@
+add_library(arm64v8_neon_intrinsic_1024_uf1_falp OBJECT
+            arm64v8_neon_intrinsic_1024_uf1_falp_src.cpp)
+target_compile_definitions(arm64v8_neon_intrinsic_1024_uf1_falp PRIVATE IS_SCALAR)
+set(FLAG -O3)
+check_cxx_compiler_flag(${FLAG} HAS_FLAG)
+if (NOT HAS_FLAG)
+    message(STATUS "The flag ${FLAG} is not supported by the current compiler")
+endif ()
+target_compile_options(arm64v8_neon_intrinsic_1024_uf1_falp PUBLIC ${FLAG})
+cmake_print_properties(TARGETS arm64v8_neon_intrinsic_1024_uf1_falp
+                       PROPERTIES COMPILE_DEFINITIONS COMPILE_OPTIONS)
+LIST(APPEND ALP_GENERATED_OBJECT_FILES
+     $<TARGET_OBJECTS:arm64v8_neon_intrinsic_1024_uf1_falp>)
+get_target_property(TARGET_NAME arm64v8_neon_intrinsic_1024_uf1_falp NAME)
+get_target_property(TARGET_COMPILE_OPTIONS arm64v8_neon_intrinsic_1024_uf1_falp COMPILE_OPTIONS)
+#------------------------------------------------------------------------------------------------------
+if (BUILD_TESTING)
+    add_executable(arm64v8_neon_intrinsic_1024_uf1_falp_test arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp)
+    target_link_libraries(arm64v8_neon_intrinsic_1024_uf1_falp_test PRIVATE ALP gtest_main arm64v8_neon_intrinsic_1024_uf1_falp)
+    target_include_directories(arm64v8_neon_intrinsic_1024_uf1_falp_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+    gtest_discover_tests(arm64v8_neon_intrinsic_1024_uf1_falp_test)
+endif ()
+#------------------------------------------------------------------------------------------------------
+if (BUILD_BENCHMARK)
+    configure_file(${CMAKE_SOURCE_DIR}/alp_bench/alp_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/arm64v8_neon_intrinsic_1024_uf1_falp_bench.hpp)
+    add_executable(arm64v8_neon_intrinsic_1024_uf1_falp_bench arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp)
+    target_link_libraries(arm64v8_neon_intrinsic_1024_uf1_falp_bench PRIVATE ALP arm64v8_neon_intrinsic_1024_uf1_falp)
+    target_include_directories(arm64v8_neon_intrinsic_1024_uf1_falp_bench PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+    add_alp_benchmark(arm64v8_neon_intrinsic_1024_uf1_falp_bench)
+endif ()
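The unfused variant tested above does the same work in two passes over the block: `unffor` first materializes the unpacked digits into `unffor_arr`, then a separate decode loop turns them into doubles. A sketch of that two-pass shape (the bit-unpacking inside `unffor` is elided; these helper names are hypothetical, not the repository's):

    #include <cstddef>
    #include <cstdint>

    // Pass 1: reverse the frame of reference (bit-unpacking elided).
    static void unffor_scalar(const uint64_t* in, uint64_t* out, uint64_t base, size_t n) {
        for (size_t i = 0; i < n; ++i) { out[i] = in[i] + base; }
    }

    // Pass 2: ALP decode over the materialized digits.
    static void decode_scalar(const uint64_t* in, double* out, int64_t fact, double frac, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            out[i] = static_cast<double>(static_cast<int64_t>(in[i]) * fact) * frac;
        }
    }

Fusing the two loops removes the store and reload of the 1024-entry intermediate, which is exactly the difference the `_fused` benchmark variants below quantify.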
diff --git a/generated/arm64v8/sve_intrinsic_uf1/CMakeLists.txt b/generated/arm64v8/sve_intrinsic_uf1/CMakeLists.txt
new file mode 100644
index 0000000..2b9ff8a
--- /dev/null
+++ b/generated/arm64v8/sve_intrinsic_uf1/CMakeLists.txt
@@ -0,0 +1,50 @@
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake)
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake)
+endif()
+set(FLS_GENERATED_OBJECT_FILES
+    ${FLS_GENERATED_OBJECT_FILES} PARENT_SCOPE)
diff --git a/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_bench.cpp b/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_bench.cpp
new file mode 100644
index 0000000..81dd937
--- /dev/null
+++ b/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_bench.cpp
@@ -0,0 +1,128 @@
+#include "arm64v8_sve_intrinsic_1024_uf1_falp_bench.hpp"
+#include "alp/alp.hpp"
+#include "datasets.hpp"
+#include "alp/ffor.hpp"
+#include "alp/unffor.hpp"
+#include <fstream>
+#include <new>
+#include <string>
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run
+bench_alp_fused_decode(alp_bench::Dataset& dataset, int64_t* ffor_arr, uint8_t bw, int64_t* base_arr, uint8_t factor, uint8_t exponent, double* dec_dbl_arr, double* exc_arr, uint16_t* pos_arr, uint16_t* exc_c_arr)
+{
+    int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+    uint64_t iterations = 3000000;
+#else
+    uint64_t iterations = 1;
+#endif
+
+    std::string benchmark_name = dataset.name + "_fused";
+
+    uint64_t cycles = benchmark::cycleclock::Now();
+    for (uint64_t i = 0; i < iterations; ++i) {
+        generated::falp::arm64v8::sve::falp(reinterpret_cast<uint64_t*>(ffor_arr),
+                                            dec_dbl_arr,
+                                            bw,
+                                            reinterpret_cast<uint64_t*>(base_arr),
+                                            factor,
+                                            exponent);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+    }
+
+    cycles = benchmark::cycleclock::Now() - cycles;
+
+    return benchmark::BenchmarkReporter::Run(
+        benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
+}
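Both harness functions report `cycles / (iterations * 1024)`, i.e. amortized cycles per decoded value, and are marked `noinline` so the measured loop cannot be folded into the caller. The same measurement shape with a portable clock, as a sketch (`benchmark::cycleclock` is the repository's own counter; `std::chrono` merely stands in for it here):

    #include <chrono>
    #include <cstdint>

    // Amortized time per decoded value for an arbitrary body; a portable
    // stand-in for the cycle-based reporting used by the generated benches.
    template <typename Body>
    static double ns_per_value(Body&& body, uint64_t iterations, uint64_t values_per_iter) {
        const auto t0 = std::chrono::steady_clock::now();
        for (uint64_t i = 0; i < iterations; ++i) { body(); }
        const auto t1 = std::chrono::steady_clock::now();
        const double ns = std::chrono::duration<double, std::nano>(t1 - t0).count();
        return ns / (double(iterations) * double(values_per_iter));
    }

Note that with `iterations = 3000000` the single 8 KiB block stays cache-resident, so the numbers mostly reflect decode arithmetic rather than memory traffic.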
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run
+bench_alp_decode(alp_bench::Dataset& dataset, int64_t* ffor_arr, int64_t* unffor_arr, uint8_t bw, int64_t* base_arr, uint8_t factor, uint8_t exponent, double* dec_dbl_arr, double* exc_arr, uint16_t* pos_arr, uint16_t* exc_c_arr)
+{
+    int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+    uint64_t iterations = 3000000;
+#else
+    uint64_t iterations = 1;
+#endif
+
+    std::string benchmark_name = dataset.name;
+
+    uint64_t cycles = benchmark::cycleclock::Now();
+    for (uint64_t i = 0; i < iterations; ++i) {
+        alp::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
+        alp::AlpDecode::decode(reinterpret_cast<uint64_t*>(unffor_arr), factor, exponent, dec_dbl_arr);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+    }
+
+    cycles = benchmark::cycleclock::Now() - cycles;
+
+    return benchmark::BenchmarkReporter::Run(
+        benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
+}
+void benchmark_all(benchmark::Benchmark& benchmark)
+{
+    double*   dbl_arr;
+    double*   exc_arr;
+    uint16_t* pos_arr;
+    uint16_t* exc_c_arr;
+    int64_t*  ffor_arr;
+    int64_t*  unffor_arr;
+
+    int64_t*  base_arr;
+    int64_t*  dig_arr;
+    double*   dec_dbl_arr;
+
+    uint8_t    bw;
+    uint8_t    factor;
+    uint8_t    exponent;
+    alp::state stt;
+
+    dbl_arr     = new (std::align_val_t {64}) double[1024];
+    exc_arr     = new (std::align_val_t {64}) double[1024];
+    pos_arr     = new (std::align_val_t {64}) uint16_t[1024];
+    dig_arr     = new (std::align_val_t {64}) int64_t[1024];
+    dec_dbl_arr = new (std::align_val_t {64}) double[1024];
+    exc_c_arr   = new (std::align_val_t {64}) uint16_t[1024];
+    ffor_arr    = new (std::align_val_t {64}) int64_t[1024];
+    unffor_arr  = new (std::align_val_t {64}) int64_t[1024];
+    base_arr    = new (std::align_val_t {64}) int64_t[1024];
+
+    for (auto& dataset : alp_bench::datasets) {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+
+        // check to see that the file was opened correctly:
+        if (!ifile.is_open()) {
+            exit(1); // exit or do additional error checking
+        }
+
+        double num = 0.0;
+        // keep storing values from the text file so long as data exists:
+        size_t c {0};
+        while (ifile >> num) {
+            dbl_arr[c] = num;
+            c += 1;
+        }
+
+        factor   = dataset.factor;
+        exponent = dataset.exponent;
+        stt.fac  = factor;
+        stt.exp  = exponent;
+
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+
+        benchmark.Run(bench_alp_fused_decode(
+            dataset, ffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));
+
+        benchmark.Run(bench_alp_decode(
+            dataset, ffor_arr, unffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));
+
+        ifile.close();
+    }
+}
+int main()
+{
+    benchmark::Benchmark benchmark =
+        benchmark::create("arm64v8_sve_intrinsic_1024_uf1_falp")
+            .save()
+            .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile())
+            .print()
+            .add_extra_info(benchmark::CmakeInfo::getCmakeInfo());
+    benchmark_all(benchmark);
+}
diff --git a/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_src.cpp b/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_src.cpp
new file mode 100644
index 0000000..730f3dc
--- /dev/null
+++ b/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_src.cpp
@@ -0,0 +1,29738 @@
+#include "alp/alp.hpp"
+#include "alp/macros.hpp"
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#else
+#include "farm_sve.h"
+#endif /* __ARM_FEATURE_SVE */
+namespace generated
+{
+namespace falp::arm64v8
+{
+namespace sve
+{
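Every generated kernel in this file shares one control pattern: a `svwhilelt`-governed predicate drives the loop, each pass covers `svcntd()` 64-bit lanes (the vector length is not fixed at compile time), and the final partial pass is masked by the predicate rather than peeled. A minimal self-contained instance of the pattern, assuming an SVE-enabled target (otherwise the file falls back to the farm_sve.h emulation included above); the function is illustrative, not part of the generated API:

    #include <arm_sve.h>
    #include <cstdint>

    // Predicated SVE loop skeleton, mirroring the generated kernels below:
    // each pass handles svcntd() lanes; svwhilelt masks the tail.
    void add_base_sve(const uint64_t* in, uint64_t* out, int64_t n, uint64_t base) {
        int64_t  i  = 0;
        svbool_t pg = svwhilelt_b64(i, n);
        do {
            svuint64_t v = svld1(pg, in + i); // predicated load
            v = svadd_x(pg, v, base);         // e.g. FFOR: add back the block base
            svst1(pg, out + i, v);            // predicated store
            i += svcntd();                    // advance by the hardware vector width
            pg = svwhilelt_b64(i, n);
        } while (svptest_any(svptrue_b64(), pg));
    }

+static void falp_0bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+{
+    [[maybe_unused]] auto out = (a_out_p);
+    [[maybe_unused]] const auto in = (a_in_p);
+    [[maybe_unused]] svuint64_t register_0;
+    [[maybe_unused]] svuint64_t tmp_0;
+    svbool_t pg = 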
svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 
54), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), base_0); + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), base_0); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_1bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 11), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 
* 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 61), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_2bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 
24), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 
* 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_3bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + 
(0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + 
(i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
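// Every unrolled step in these generated FALP kernels repeats one decode
// sequence per 16-lane slice: shift+mask the next b-bit field out of the
// packed word, add the frame-of-reference base, multiply by the 10^fac
// factor while still in the integer domain, convert the lanes to double
// (the FIX placeholder; an svcvt_f64_s64_x(pg, svreinterpret_s64_u64(tmp_0))
// style conversion of tmp_0 into tmp_dbl would fit there, though that is an
// assumption), and scale by the 10^-exp fraction before the predicated
// store. A scalar sketch of one step under that assumption, with
// illustrative names (F10_INT and F10_INV are not identifiers from this
// file):
//   uint64_t bits   = (word >> shift) & ((1ULL << 3) - 1);
//   int64_t  digits = (int64_t)(bits + base);
//   out[k]          = (double)(digits * F10_INT[fac]) * F10_INV[exp];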
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 3) - 1)); + 
tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_4bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), 
svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i 
* 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL 
<< 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 
* 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_5bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, 
out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
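// When the bit width does not divide 64, a field straddles two packed words
// and the kernel stitches it together in two parts. Here the 5-bit value at
// position 25 occupies bits 61..63 of the current word and bits 0..1 of the
// next one: the code keeps the 3 leftover high bits, reloads register_0 with
// the next word, and ORs in its low 2 bits shifted left by 3. In scalar
// terms:
//   uint64_t lo   = (word_a >> 61) & 0x7;  // 3 low bits of the field
//   uint64_t hi   = (word_b & 0x3) << 3;   // 2 high bits of the field
//   uint64_t bits = hi | lo;               // reassembled 5-bit field
// svlsr_x, svand_u64_x, svlsl_x and svorr_u64_x perform exactly this per
// 64-bit lane.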
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 
* 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 
2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_6bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, 
out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 6) - 1)); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); 
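// Each kernel ends with the standard SVE vector-length-agnostic loop: i
// advances by svcntd() (the number of 64-bit lanes the hardware provides),
// the governing predicate is re-formed with svwhilelt_b64(i, 1024LL / 64),
// i.e. 16 interleaved rows of 64 values each, and the do/while exits once
// svptest_any reports no active lanes. A minimal sketch of the same loop
// shape:
//   int64_t i = 0;
//   svbool_t pg = svwhilelt_b64((int64_t)0, (int64_t)16);
//   do {
//       /* decode rows [i, i + svcntd()) under predicate pg */
//       i += svcntd();
//       pg = svwhilelt_b64(i, (int64_t)16);
//   } while (svptest_any(svptrue_b64(), pg));
// Because nothing here assumes a fixed vector width, the same object code
// runs unchanged on any SVE implementation from 128 to 2048 bits.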
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_7bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 
*= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = 
svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= 
frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 
* 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_8bw_64ow_128crw_1uf(const uint64_t *__restrict 
a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + 
(16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), 
svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_9bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out 
+ (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,6), tmp_0); + 
tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX 
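+ // `FIX` here (and throughout these generated kernels) is an unexpanded
+ // placeholder left by the kernel generator for the integer-to-double
+ // conversion that must happen between the `*= factor` step and the
+ // `*= frac10` scaling. A minimal sketch of what it plausibly expands to,
+ // assuming the usual ALP decode order (digits -> signed int64 -> double):
+ //   svfloat64_t tmp_dbl = svcvt_f64_s64_x(pg, svreinterpret_s64_u64(tmp_0));
+ // Likewise, `factor` and `frac10` are not defined in this excerpt; they
+ // are presumably constants derived from the `fac` and `exp` parameters
+ // (e.g. hypothetical lookups factor = FACT_ARR[fac] and
+ // frac10 = svdup_f64(FRAC_ARR[exp])), and `base_0` is presumably loaded
+ // from `a_base_p` before the loop. Sketch only, not generator output.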
+ tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_10bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 
2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_11bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + 
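+ // The unpack sequence below walks register_0 in 11-bit strides (bit
+ // offsets 11, 22, 33, 44), masking each field with (1ULL << 11) - 1;
+ // the field starting at offset 55 straddles into the next input word,
+ // so its low 9 bits come from this word and its high 2 bits are OR-ed
+ // in from the next svld1.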
tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 48), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; 
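+ // When an 11-bit code straddles a 64-bit word boundary, the generator emits
+ // the split pattern used above: keep the top bits of the current word
+ // (svlsr_x + mask), load the next packed word, and OR in its low-order bits
+ // shifted up into place (svand_u64_x + svlsl_x), so tmp_0 holds the full
+ // code again before the add/multiply/convert steps.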
+ FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; 
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_12bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + 
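+ // In the 12-bit kernel the packing realigns every 16 values: 16 x 12 bits
+ // fill exactly three 64-bit words, so the groups at stores 0, 16, 32 and 48
+ // each begin from a freshly loaded word with no shift, and one iteration
+ // consumes twelve packed input words (loads at offsets 0 through 176 in
+ // steps of 16).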
tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_13bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + 
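+ // Editorial note: each kernel body is a tail-predicated do/while loop:
+ // pg = svwhilelt_b64(i, 1024LL / 64) predicates the 16 stripe positions,
+ // i advances by svcntd() lanes per pass, and the loop ends once
+ // svptest_any sees an all-false predicate. Several identifiers used here
+ // are never declared or initialised in this excerpt: base_0 is presumably
+ // broadcast from a_base_p (e.g. base_0 = svdup_u64(a_base_p[0]);), factor
+ // and frac10 presumably come from ALP lookup tables indexed by the fac and
+ // exp parameters, and tmp_dbl is the svfloat64_t decode target. The
+ // prologues' "static_cast(0LL)" has also lost its template argument in
+ // transit, most likely static_cast<int64_t>(0LL).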
tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 13) - 1)); + 
tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 
2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 22), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_14bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); 
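+ // Naming note: the falp_<N>bw_64ow_128crw_1uf variants differ only in the
+ // packed bit width <N>; the suffixes presumably denote 64-bit output words,
+ // a 128-bit compression register width, and an unroll factor of 1. Each
+ // call decodes one 1024-value block, with the fully unrolled body emitting
+ // all 64 store groups (16 * 0 through 16 * 63) per predicate step.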
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), 
svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 20), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 14) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 192);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,12), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 208);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,6), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_15bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) +
(0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), 
svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); 
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), 
svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 15) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 15) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 224);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,11), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 15) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 15) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 15) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_16bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl);
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+
tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), 
svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl);
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 224);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 240);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_17bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 17) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 17) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 =
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 17) 
- 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), 
svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 17) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 
2) + (16 * 58), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 17) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 256);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,4), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 17) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 17) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_18bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 18) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 18) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,10), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 18) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 18) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 18) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg,
register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + 
(0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 
= svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl 
*= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), 
svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_19bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) 
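+ // falp_19bw_64ow_128crw_1uf above begins the 19-bit-width member of this
+ // generated kernel family. Each function decodes a 1024-value block stored as
+ // 16 interleaved columns: input word w of column c sits at in[c + 16 * w].
+ // Per value the chain is: mask the packed digits, add the frame-of-reference
+ // base, multiply by factor (10^fac in ALP decoding), convert to double (the
+ // generator's FIX placeholder), and scale by frac10 (10^-exp). The prologue
+ // leaves base_0, factor, frac10 and tmp_dbl implicit; a plausible setup,
+ // assumed rather than taken from this patch (table names hypothetical):
+ //   svuint64_t  base_0 = svdup_u64(a_base_p[0]);       // FOR base
+ //   svuint64_t  factor = svdup_u64(U_FACT_ARR[fac]);   // 10^fac
+ //   svfloat64_t frac10 = svdup_f64(FRAC_ARR[exp]);     // 10^-exp
+ //   svfloat64_t tmp_dbl;  // FIX then stands for something like:
+ //   //  tmp_dbl = svcvt_f64_s64_x(pg, svreinterpret_s64_u64(tmp_0));
+ // Note that the compound assignments on SVE vectors (tmp_0 += base_0) need a
+ // compiler with ACLE operator support, and static_cast(0LL) only compiles
+ // with its template argument restored: static_cast<int64_t>(0LL).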
+ 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,11), tmp_0); + tmp_0 += base_0; 
+ tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= 
frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_20bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const 
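+ // The do/while that just closed falp_19bw is vector-length agnostic: the
+ // predicate masks the trailing partial iteration, so no scalar tail loop is
+ // needed. The skeleton, common to every kernel in this family:
+ //   int64_t i = 0;
+ //   svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), 1024LL / 64);
+ //   do {
+ //       /* predicated loads, unpack, decode, stores */
+ //       i += svcntd();                       // 64-bit lanes per SVE vector
+ //       pg = svwhilelt_b64(i, 1024LL / 64);  // masks the final partial step
+ //   } while (svptest_any(svptrue_b64(), pg));
+ // In the 20-bit kernel that starts here, 16 values fill exactly five words
+ // (16 * 20 = 320 = 5 * 64), so the bit offsets realign every 16 values and
+ // the generator restarts with a plain masked load (at word offsets 80, 160,
+ // 240) instead of ORing two fragments; 19 and 21 are coprime to 64, so those
+ // kernels never realign within the 64-value column.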
+ uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+     [[maybe_unused]] auto out = (a_out_p);
+     [[maybe_unused]] const auto in = (a_in_p);
+     [[maybe_unused]] svuint64_t register_0;
+     [[maybe_unused]] svuint64_t tmp_0;
+     svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+     int64_t i = 0;
+     [[maybe_unused]] svuint64_t base_0;
+     do
+     {
+         register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+         tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1));
+         register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+         tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)), 4), tmp_0);
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1));
+         register_0 = svld1(pg, in + (0 * 16) + (i) + 32);
+         tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)), 8), tmp_0);
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1));
+         register_0 = svld1(pg, in + (0 * 16) + (i) + 48);
+         tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)), 12), tmp_0);
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 20) - 1));
+         tmp_0 += base_0;
+         tmp_0 *= factor;
+         FIX
+         tmp_dbl *= frac10;
+         svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl);
+         tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1));
+         register_0 = svld1(pg, in + (0 * 16) + (i) +
64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 
32), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + 
register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_21bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), 
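+ // falp_21bw walks the whole word with its offsets: value k starts at bit
+ // (21 * k) % 64, and whenever start + 21 exceeds 64 the value splits across
+ // two words, which is why the first fragment's mask can shrink to a single
+ // bit ((1ULL << 1) - 1 at bit 63 below). Per value k the generator in effect
+ // emits this scalar logic (a sketch, not taken from the patch):
+ //   unsigned off = (21u * k) % 64u;
+ //   uint64_t lo  = (word[w] >> off) & ((1ULL << std::min(21u, 64u - off)) - 1);
+ //   uint64_t v   = (off + 21 <= 64)
+ //                ? lo
+ //                : lo | ((word[w + 1] & ((1ULL << (off + 21 - 64)) - 1)) << (64 - off));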
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 
80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + 
tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + 
(16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_22bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,20), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_23bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + 
tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,16), tmp_0); + 
tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + 
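// --- editorial note --------------------------------------------------------
// Each generated falp_*bw_64ow_128crw_1uf kernel above and below repeats one
// pattern per packed value: (1) shift/mask the bit-packed lanes out of
// register_0, (2) add the frame-of-reference base in base_0, (3) multiply by
// the ALP factor (a power of ten selected by `fac`), (4) convert the integer
// lanes to doubles, and (5) scale by frac10 (the 10^-exp fraction selected by
// `exp`) before the svst1 store. The bare `FIX` token between steps 3 and 5
// appears to be an unexpanded generator placeholder for step 4. A minimal
// sketch of that step, assuming the ACLE SVE overloaded intrinsics and a
// locally declared `svfloat64_t tmp_dbl` (neither is confirmed against the
// code generator), would be:
//
//     svfloat64_t tmp_dbl = svcvt_f64_x(pg, svreinterpret_s64(tmp_0));
//
// Similarly, `factor` and `frac10` are never defined in these kernels
// (presumably scalar powers of ten looked up from `fac` and `exp`), `base_0`
// is declared but never loaded from a_base_p, and each preamble's
// `svwhilelt_b64(static_cast(0LL), ...)` has apparently lost its `<int64_t>`
// template argument, likely an extraction artifact.
// ----------------------------------------------------------------------------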
tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 
2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_24bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), 
svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 
24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i 
* 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + 
(0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_25bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 25) - 1)); + tmp_0 += base_0; + tmp_0 
*= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 25) - 1));
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1));
+        register_0 = svld1(pg, in + (0 * 16) + (i) + 128);
+        tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)), 12), tmp_0);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl);
[... values 21-63 of the 25-bit kernel elided here: the same mask/shift/add/scale/store pattern at bit offsets (k * 25) mod 64, with a fresh input word loaded at +144, +160, ..., +384 whenever a value spans two 64-bit words ...]
+        i += svcntd();
+        pg = svwhilelt_b64(i, (1024LL / 64));
+    }
+    while (svptest_any(svptrue_b64(), pg));
+}
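Editor's note: every kernel body in this hunk leans on identifiers the code generator left unexpanded. The bare FIX token, factor, frac10, and tmp_dbl are never declared, and base_0 is declared but never initialized from a_base_p, so these functions cannot compile as emitted. The sketch below shows what the missing pieces plausibly expand to, assuming ALP's usual reconstruction (value = double((packed + base) * 10^fac) * 10^-exp) and the overloaded arm_sve.h intrinsic names; FACT_ARR and FRAC_ARR are hypothetical stand-ins for the library's constant tables, not its real identifiers.

// Hedged sketch only, not the library's verbatim code.
#include <arm_sve.h>
#include <cstdint>

// Hypothetical stand-ins for ALP's 10^i and 10^-i lookup tables.
static const uint64_t FACT_ARR[4] = {1ULL, 10ULL, 100ULL, 1000ULL};
static const double   FRAC_ARR[4] = {1.0, 0.1, 0.01, 0.001};

// What one 25-bit value decode presumably looks like once the generator
// placeholders are expanded: unpack, add base, scale by 10^fac as an
// integer, convert to double ("FIX"), then scale by 10^-exp.
static void decode_lane0_25bw(const uint64_t* in, double* out,
                              const uint64_t* a_base_p, uint8_t fac, uint8_t exp) {
    svbool_t    pg     = svptrue_b64();
    svuint64_t  base_0 = svdup_u64(a_base_p[0]);      // init missing in the generated code
    svuint64_t  factor = svdup_u64(FACT_ARR[fac]);    // "factor" = 10^fac
    svfloat64_t frac10 = svdup_f64(FRAC_ARR[exp]);    // "frac10" = 10^-exp
    svuint64_t  reg    = svld1(pg, in);
    svuint64_t  tmp_0  = svand_u64_x(pg, reg, svdup_u64((1ULL << 25) - 1));
    tmp_0 = svadd_u64_x(pg, tmp_0, base_0);           // tmp_0 += base_0
    tmp_0 = svmul_u64_x(pg, tmp_0, factor);           // tmp_0 *= factor
    // "FIX" is presumably this signed int64 -> float64 conversion:
    svfloat64_t tmp_dbl = svcvt_f64_x(pg, svreinterpret_s64(tmp_0));
    tmp_dbl = svmul_f64_x(pg, tmp_dbl, frac10);       // tmp_dbl *= frac10
    svst1(pg, out, tmp_dbl);
}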
+static void falp_26bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+{
+    [[maybe_unused]] auto out = (a_out_p);
+    [[maybe_unused]] const auto in = (a_in_p);
+    [[maybe_unused]] svuint64_t register_0;
+    [[maybe_unused]] svuint64_t tmp_0;
+    svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+    int64_t i = 0;
+    [[maybe_unused]] svuint64_t base_0;
+    do
+    {
+        register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+        tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1));
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 26) - 1));
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1));
+        register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+        tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)), 12), tmp_0);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
[... values 3-63 of the 26-bit kernel elided: same pattern, with fresh input words loaded at +32, +48, ..., +400; the shift/mask schedule repeats at value 32 because 32 * 26 is a multiple of 64 ...]
+        i += svcntd();
+        pg = svwhilelt_b64(i, (1024LL / 64));
+    }
+    while (svptest_any(svptrue_b64(), pg));
+}
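Editor's note: each falp_<bw>bw_64ow_128crw_1uf body above is a fully unrolled bit-unpack of 1024 bw-bit integers followed by the ALP reconstruction. A compact scalar reference can serve as a cross-check for the unrolled SVE schedules; this sketch covers the arithmetic only and deliberately ignores the 16-way lane interleaving the generated kernels apply to their input and output indices.

#include <cstdint>
#include <cstddef>

// Portable reference for the per-value work of the kernels above: unpack a
// bw-bit integer, add the frame-of-reference base, multiply by 10^fac as an
// integer, convert, and scale by 10^-exp. Packing here is plain
// bit-contiguous, unlike the interleaved layout the SVE kernels assume.
static void falp_scalar_reference(const uint64_t* in, double* out, uint64_t base,
                                  unsigned bw, uint64_t fact, double frac) {
    const uint64_t mask = (bw == 64) ? ~0ULL : ((1ULL << bw) - 1);
    for (size_t k = 0; k < 1024; ++k) {
        size_t   bit = k * bw;
        size_t   off = bit % 64;
        uint64_t v   = in[bit / 64] >> off;
        if (off + bw > 64) v |= in[bit / 64 + 1] << (64 - off); // value spans two words
        v = ((v & mask) + base) * fact;
        out[k] = static_cast<double>(static_cast<int64_t>(v)) * frac;
    }
}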
+static void falp_27bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+{
+    [[maybe_unused]] auto out = (a_out_p);
+    [[maybe_unused]] const auto in = (a_in_p);
+    [[maybe_unused]] svuint64_t register_0;
+    [[maybe_unused]] svuint64_t tmp_0;
+    svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+    int64_t i = 0;
+    [[maybe_unused]] svuint64_t base_0;
+    do
+    {
+        register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+        tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1));
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 27) - 1));
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1));
+        register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+        tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)), 10), tmp_0);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
[... values 3-63 of the 27-bit kernel elided: same pattern at bit offsets (k * 27) mod 64, with fresh input words loaded at +32, +48, ..., +416 ...]
+        i += svcntd();
+        pg = svwhilelt_b64(i, (1024LL / 64));
+    }
+    while (svptest_any(svptrue_b64(), pg));
+}
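Editor's note: the shift and mask constants in these unrolled bodies are fully determined by the packing position. Value k starts at bit offset (k * bw) mod 64, and whenever it crosses a 64-bit word boundary the next input word is loaded and its low bits are shifted up and ORed in. The small sanity-check program below reproduces the schedule; bw = 28 matches the falp_28bw kernel that follows, where the schedule also repeats every 16 values since 16 * 28 is a multiple of 64.

#include <cstdint>
#include <cstdio>

// Reproduce the shift/mask schedule the generator emitted for one bit width.
int main() {
    const unsigned bw = 28;                       // e.g. the falp_28bw kernel below
    for (unsigned k = 0; k < 64; ++k) {
        unsigned off = (k * bw) % 64;             // shift within the current word
        if (off + bw <= 64) {
            printf("value %2u: >> %2u, mask %u bits\n", k, off, bw);
        } else {
            unsigned lo = 64 - off;               // bits taken from this word
            printf("value %2u: >> %2u (%u bits), next word << %u (%u bits)\n",
                   k, off, lo, lo, bw - lo);
        }
    }
}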
+static void falp_28bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+{
+    [[maybe_unused]] auto out = (a_out_p);
+    [[maybe_unused]] const auto in = (a_in_p);
+    [[maybe_unused]] svuint64_t register_0;
+    [[maybe_unused]] svuint64_t tmp_0;
+    svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+    int64_t i = 0;
+    [[maybe_unused]] svuint64_t base_0;
+    do
+    {
+        register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+        tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1));
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 28) - 1));
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+        tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1));
+        register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+        tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)), 8), tmp_0);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        FIX
+        tmp_dbl *= frac10;
+        svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
[... values 3-63 of the 28-bit kernel elided: same pattern, with fresh input words loaded at +32, +48, ..., +432; the shift/mask schedule repeats every 16 values (16 * 28 is a multiple of 64), so values 16, 32, and 48 each start a fresh word with a plain 28-bit mask ...]
+        i += svcntd();
+        pg = svwhilelt_b64(i, (1024LL / 64));
+    }
+    while (svptest_any(svptrue_b64(), pg));
+}
+static void
falp_29bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + 
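+ // [editorial note] The bare `FIX` token that precedes each `tmp_dbl *= frac10;`
+ // in these SVE kernels is an unexpanded generator placeholder: `tmp_dbl` is
+ // never declared and no integer-to-double conversion is emitted, so the file
+ // cannot compile as-is. Judging from the surrounding dataflow (unpack, add
+ // base, multiply by factor, scale by frac10, store), the placeholder
+ // presumably stands for something like
+ //   svfloat64_t tmp_dbl = svcvt_f64_s64_x(pg, svreinterpret_s64_u64(tmp_0));
+ // i.e. reinterpret the decoded digits as signed 64-bit integers and convert
+ // them to doubles before the 10^-exp scaling; the exact expansion is an
+ // assumption, not part of this patch. Likewise, `static_cast(0LL)` in the
+ // `svwhilelt_b64` initialisation above is missing its target type;
+ // `static_cast<int64_t>(0LL)` is presumably intended.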
register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + 
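+ // [editorial note] The load/or pair above is the cross-word splice of the
+ // 29-bit layout: value 11 contributes 1 bit from bit 63 of the previous word
+ // and its remaining 28 bits from the next word, so the kernel masks the low
+ // 28 bits of the freshly loaded word, shifts them left by 1, and ORs them
+ // onto the carried bit. In scalar terms:
+ //   v = (prev >> 63) | ((next & ((1ULL << 28) - 1)) << 1);
+ // Every `svorr_u64_x(svlsl_x(...), tmp_0)` line in these kernels is this same
+ // splice with different shift and mask constants.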
tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
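+ // [editorial note] `base_0`, `factor` and `frac10` are used throughout but
+ // never initialised in these kernels: `base_0` is only declared
+ // `[[maybe_unused]]`, and the `a_base_p`, `fac` and `exp` parameters are
+ // otherwise ignored. By analogy with the scheme's scalar decoders, the
+ // preamble presumably needs something like
+ //   base_0 = svdup_u64(a_base_p[0]);        // frame-of-reference base
+ //   uint64_t factor = /* 10^fac lookup */;  // digit rescaling multiplier
+ //   double   frac10 = /* 10^-exp lookup */; // final decimal scale
+ // with the two lookups taken from the codec's constant tables; the exact
+ // table names are not visible in this hunk, so this is an assumption.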
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = 
svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_30bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 30) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,6), tmp_0); + tmp_0 += base_0; + 
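+ // [editorial note] Loop structure, common to all of these kernels: one
+ // FastLanes block holds 1024 values arranged as 16 interleaved lanes of 64
+ // values, so the predicate loop walks the 16 lanes in vector-length-agnostic
+ // steps:
+ //   pg = svwhilelt_b64(i, 16)  keeps only the still-valid lanes,
+ //   i += svcntd()              advances by the number of 64-bit elements per
+ //                              vector (e.g. 2 on a 128-bit implementation),
+ //   svptest_any(...)           exits once every lane has been processed.
+ // The 64 unrolled steps each decode one row across those lanes, which is why
+ // the stores stride by 16 doubles per row.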
tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, 
svdup_u64((1ULL << 6) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_31bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 
16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + 
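+ // [editorial sketch] The whole unrolled family reduces to one generic
+ // per-lane pattern. A plain-C reference for one lane of a bw-bit layout
+ // (0 < bw < 64; the 16-lane interleaved striding of the real kernels is
+ // omitted, and the names are illustrative rather than part of this patch):
+ //   for (int row = 0; row < 64; ++row) {
+ //       int      fb  = row * bw;                  // first bit of this value
+ //       uint64_t v   = in[fb / 64] >> (fb % 64);  // bits left in this word
+ //       int      got = 64 - (fb % 64);
+ //       if (got < bw) v |= in[fb / 64 + 1] << got;      // cross-word splice
+ //       uint64_t dig = (v & ((1ULL << bw) - 1)) + base; // unpack + FOR base
+ //       out[row] = (double)(int64_t)(dig * factor) * frac10; // ALP decode
+ //   }
+ // falp_32bw_... further below is the degenerate case: 32-bit fields never
+ // straddle a word, so each loaded word simply yields two values (low half,
+ // high half) and the splice disappears.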
tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 12), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_32bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const 
static void falp_32bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 *
16 * 2) + (16 * 13), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 
2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + }
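falp_32bw above is the one width in this range where fields never straddle a 64-bit word: each word holds exactly two 32-bit fields, so the kernel only alternates a mask with a shift and never needs the svorr_u64_x splice seen in the odd-width kernels. A scalar sketch of that special case (illustrative name, not from this patch):

#include <cstdint>

// bw = 32: two fields per word; the mask mirrors the svand_u64_x step,
// the shift mirrors the svlsr_x step.
static inline void unpack32_pair(uint64_t word, uint64_t out[2]) {
    out[0] = word & ((1ULL << 32) - 1);
    out[1] = word >> 32;
}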
static void falp_33bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg,
svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX 
+ tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + }
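For widths above 32 bits (falp_33bw above, falp_34bw next), most fields straddle a word boundary, which is why the generator emits the svlsr_x / svand_u64_x / svlsl_x / svorr_u64_x sequence: the tail of the current word supplies the low bits, and the head of the next word is shifted up and OR-ed in. A scalar mirror of one such splice, using bw = 34 as in the next kernel (splice34 is an illustrative name):

#include <cstdint>

// One straddling 34-bit field: lo_bits come from `cur` starting at `shift`,
// the remaining (34 - lo_bits) come from the bottom of `next`.
static inline uint64_t splice34(uint64_t cur, uint64_t next,
                                unsigned shift, unsigned lo_bits) {
    uint64_t lo = (cur >> shift) & ((1ULL << lo_bits) - 1);
    uint64_t hi = next & ((1ULL << (34 - lo_bits)) - 1);
    return (hi << lo_bits) | lo;
}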
static void falp_34bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 *
16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + 
(16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + 
(i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 
* 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + 
tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + }
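All of these kernels share the same tail-predicated scaffolding: i counts packed 64-bit rows, svcntd() advances by the number of 64-bit lanes per vector, and svwhilelt_b64 / svptest_any re-derive the predicate until the 1024LL / 64 = 16 rows are consumed. Stripped of the 64 unpack-and-decode steps, the skeleton looks like the following sketch (assuming <arm_sve.h> and the ACLE overloaded intrinsic names used throughout this file; kernel_skeleton is an illustrative name):

#include <arm_sve.h>
#include <cstdint>

static void kernel_skeleton(const uint64_t* __restrict in,
                            uint64_t* __restrict out) {
    svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), 1024LL / 64);
    int64_t i = 0;
    do {
        svuint64_t row = svld1(pg, in + i); // one packed input row
        // ... per-row unpack, base add, and decode steps go here ...
        svst1(pg, out + i, row);
        i += svcntd();                      // 64-bit lanes per SVE vector
        pg = svwhilelt_b64(i, 1024LL / 64);
    } while (svptest_any(svptrue_b64(), pg));
}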
static void falp_35bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg,
register_0, 25), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), 
svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
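The kernels above repeat one bit-unpacking idiom per value: a svand with the width mask extracts a value that sits inside a single 64-bit word, and when a value straddles two words its low bits come from the current word via svlsr while its high bits come from the next loaded word via svand plus svlsl, merged with svorr. Ignoring the FastLanes interleaving (each lane's packed words sit 16 entries apart in `in`), the per-lane splice is the classic one below, shown as a minimal scalar sketch with illustrative names, not code from this patch:

#include <cstdint>
#include <cstddef>

// Scalar reference for the straddling-value pattern used by the SVE kernels:
// value k of width BW starts at absolute bit offset k * BW inside one lane's
// packed stream; when it crosses a 64-bit word boundary, its low and high
// parts are merged exactly as the svlsr/svand/svlsl/svorr sequence does.
template <unsigned BW>
uint64_t unpack_one(const uint64_t* packed, size_t k) {
    static_assert(BW >= 1 && BW <= 64, "bit width out of range");
    const uint64_t mask  = (BW == 64) ? ~0ULL : ((1ULL << BW) - 1);
    const size_t   bit   = k * BW;
    const size_t   word  = bit / 64;
    const unsigned shift = bit % 64;
    uint64_t v = packed[word] >> shift;
    if (shift + BW > 64) {                      // value straddles two words
        v |= packed[word + 1] << (64 - shift);  // splice in the high bits
    }
    return v & mask;
}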
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_36bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
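The literal FIX token that precedes every `tmp_dbl *= frac10;` is an unexpanded template hole: `tmp_dbl` is never declared, so FIX is presumably where the generator emits the integer-to-double conversion of the re-based, 10^fac-scaled digits. A plausible expansion, modeled on the decode order visible in the surrounding statements; this is an assumption about the generator's intent, not code from the patch:

// Hypothetical expansion of FIX: treat the unpacked lanes as signed digits
// and convert them to doubles so `tmp_dbl *= frac10;` can apply 10^-exp.
svfloat64_t tmp_dbl = svcvt_f64_x(pg, svreinterpret_s64(tmp_0));

In the repeated body, `tmp_dbl` would be declared once in the kernel preamble and assigned at each FIX site.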
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
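As emitted, none of these kernels compile: `static_cast(0LL)` lacks its template argument (presumably `static_cast<int64_t>`), the `fac` and `exp` parameters are never turned into `factor` and `frac10`, `base_0` is never loaded from `a_base_p`, and the `+=`/`*=` shorthand on svuint64_t values assumes a compiler that accepts operators on SVE types (otherwise svadd_u64_x and svmul_u64_x). A compilable sketch of the shared skeleton with those holes filled and one representative unpack shown; the names, the runtime 10^fac/10^-exp computation, and the base-loading policy are assumptions, not taken from this patch:

#include <arm_sve.h>
#include <cstdint>

// Presumed shape of one kernel once the template holes are filled in.
static void falp_kernel_sketch(const uint64_t* __restrict in,
                               double* __restrict out,
                               const uint64_t* __restrict base_p,
                               uint8_t fac, uint8_t exp) {
    uint64_t fact = 1;                 // 10^fac; a real build uses a table
    for (uint8_t f = 0; f < fac; ++f) fact *= 10;
    double frac = 1.0;                 // 10^-exp; a table keeps this exact
    for (uint8_t e = 0; e < exp; ++e) frac *= 0.1;
    const svuint64_t  factor = svdup_u64(fact);
    const svfloat64_t frac10 = svdup_f64(frac);

    int64_t i = 0;
    svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0), 1024LL / 64);
    do {
        const svuint64_t base_0 = svld1(pg, base_p); // frame-of-reference base
        svuint64_t reg = svld1(pg, in + i);
        svuint64_t tmp = svand_u64_x(pg, reg, svdup_u64((1ULL << 36) - 1));
        tmp = svadd_u64_x(pg, tmp, base_0);          // operator-free += base_0
        tmp = svmul_u64_x(pg, tmp, factor);          // operator-free *= factor
        svfloat64_t dbl = svcvt_f64_x(pg, svreinterpret_s64(tmp));
        dbl = svmul_f64_x(pg, dbl, frac10);          // operator-free *= frac10
        svst1(pg, out + i, dbl);
        i += svcntd();                               // 64-bit lanes per vector
        pg = svwhilelt_b64(i, 1024LL / 64);          // re-predicate the tail
    } while (svptest_any(svptrue_b64(), pg));
}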
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 
544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_37bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 
= svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,15), tmp_0); 
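For checking the generated kernels against something readable, the whole pipeline per value reduces to: unpack the bw-bit field, add the frame-of-reference base, multiply by 10^fac as an integer, convert to double, and multiply by 10^-exp. A scalar reference with illustrative names (a real implementation would use precomputed constant tables so the 10^-exp factors stay exact):

#include <cstdint>

// Scalar reference for one FALP lane, mirroring the vector pipeline above:
// unpack -> add FOR base -> integer multiply by 10^fac -> to double -> * 10^-exp.
inline double falp_decode_one(uint64_t unpacked, uint64_t base,
                              uint8_t fac, uint8_t exp) {
    uint64_t fact = 1;                       // 10^fac as an integer
    for (uint8_t f = 0; f < fac; ++f) fact *= 10;
    double frac10 = 1.0;                     // 10^-exp
    for (uint8_t e = 0; e < exp; ++e) frac10 *= 0.1;
    const int64_t digits = static_cast<int64_t>((unpacked + base) * fact);
    return static_cast<double>(digits) * frac10;
}

On the output side, the store offsets `out + (i * 2) + (16 * k)` walk the 64 rows of the 1024-value FastLanes block (the `16 * k` term), while `i` advances through the interleaved lanes within each row.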
+ tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = 
svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 
2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_38bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double 
*__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, 
svdup_u64((1ULL << 34) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,26), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + 
(0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 38) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 
2) + (16 * 56), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 544);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,10), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 560);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,36), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 38) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 576);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,24), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 38) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 592);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,12), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_39bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,25), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 39) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + 
(16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), 
svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
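+ // Decode pattern repeated by every unrolled step in these generated kernels:
+ // svand_u64_x / svlsr_x mask one bw-bit code (39 bits in this kernel) out of the
+ // packed word, svlsl_x + svorr_u64_x stitch together codes that straddle a 64-bit
+ // word boundary, base_0 adds the frame-of-reference base back, and the factor /
+ // frac10 multiplications undo the ALP scaling. FIX is kept verbatim as emitted by
+ // the generator; from the surrounding dataflow it appears to stand for the
+ // integer-to-double conversion step that produces tmp_dbl before the svst1 store.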
FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 
36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), 
svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 39) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 592);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,28), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 39) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 608);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,14), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_40bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 32);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 48);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= 
frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 
1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); 
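+ // Loop control shared by all of these kernels: pg = svwhilelt_b64(i, 1024 / 64)
+ // keeps exactly the lanes with index < 16 active, i advances by svcntd() (the
+ // number of 64-bit lanes in one SVE vector), and the do/while exits once
+ // svptest_any reports no active lane, so the same body is correct for any
+ // hardware SVE vector length.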
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_41bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = 
(a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,23), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 41) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 32);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,5), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 48);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,28), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 41) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 64);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,10), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 80);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,33), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 41) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 96);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,15), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 112);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, 
svdup_u64((1ULL << 3) - 1)) ,38), tmp_0);
+			tmp_0 += base_0;
+			tmp_0 *= factor;
+			FIX
+			tmp_dbl *= frac10;
+			svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl);
+			/* NOTE (applies to every kernel in this file): `FIX` is a literal placeholder
+			   left by the code generator, presumably the u64 -> f64 conversion that defines
+			   `tmp_dbl`; `factor`, `frac10` and `tmp_dbl` are not declared in these
+			   functions, and `base_0` is never loaded from `a_base_p`, so the kernels do
+			   not compile as emitted. */
+			/* ... output lanes 11..63 of the 41-bit kernel repeat the same shift/mask/OR
+			   unpack followed by base add, factor multiply, FIX, frac10 multiply, and a
+			   16-value store ... */
+			i += svcntd();
+			pg = svwhilelt_b64(i, (1024LL / 64));
+		}
+		while (svptest_any(svptrue_b64(), pg));
+	}
+	static void falp_42bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+	{
+		[[maybe_unused]] auto out = (a_out_p);
+		[[maybe_unused]] const auto in = (a_in_p);
+		[[maybe_unused]] svuint64_t register_0;
+		[[maybe_unused]] svuint64_t tmp_0;
+		svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+		int64_t i = 0;
+		[[maybe_unused]] svuint64_t base_0;
+		do
+		{
+			register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+			tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1));
+			tmp_0 += base_0;
+			tmp_0 *= factor;
+			FIX
+			tmp_dbl *= frac10;
+			svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+			tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1));
+			register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+			tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)), 22), tmp_0);
+			tmp_0 += base_0;
+			tmp_0 *= factor;
+			FIX
+			tmp_dbl *= frac10;
+			svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+			/* ... lanes 2..63 of the 42-bit kernel follow the same pattern ... */
+			i += svcntd();
+			pg = svwhilelt_b64(i, (1024LL / 64));
+		}
+		while (svptest_any(svptrue_b64(), pg));
+	}
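+	/* A minimal reference sketch of what each falp_*bw_* kernel above and below
+	   computes per output lane, assuming `FIX` expands to the u64 -> f64
+	   conversion and that `factor` / `frac10` hold the ALP constants 10^fac and
+	   10^-exp (names taken from the generated code; the expansion itself is an
+	   assumption, not confirmed by this patch):
+
+	       uint64_t digits = unpacked_bits + base;      // undo frame-of-reference
+	       digits *= factor;                            // presumably * 10^fac
+	       double value = (double)(int64_t)digits;      // the FIX step
+	       value *= frac10;                             // presumably * 10^-exp
+	       out[lane] = value;                           // svst1 of 16 doubles
+	*/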
+	static void falp_43bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+	{
+		[[maybe_unused]] auto out = (a_out_p);
+		[[maybe_unused]] const auto in = (a_in_p);
+		[[maybe_unused]] svuint64_t register_0;
+		[[maybe_unused]] svuint64_t tmp_0;
+		svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+		int64_t i = 0;
+		[[maybe_unused]] svuint64_t base_0;
+		do
+		{
+			register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+			tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1));
+			tmp_0 += base_0;
+			tmp_0 *= factor;
+			FIX
+			tmp_dbl *= frac10;
+			svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+			tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1));
+			register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+			tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)), 21), tmp_0);
+			tmp_0 += base_0;
+			tmp_0 *= factor;
+			FIX
+			tmp_dbl *= frac10;
+			svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+			/* ... lanes 2..63 of the 43-bit kernel follow the same pattern ... */
+			i += svcntd();
+			pg = svwhilelt_b64(i, (1024LL / 64));
+		}
+		while (svptest_any(svptrue_b64(), pg));
+	}
+	static void falp_44bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+	{
+		[[maybe_unused]] auto out = (a_out_p);
+		[[maybe_unused]] const auto in = (a_in_p);
+		[[maybe_unused]] svuint64_t register_0;
+		[[maybe_unused]] svuint64_t tmp_0;
+		svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+		int64_t i = 0;
+		[[maybe_unused]] svuint64_t base_0;
+		do
+		{
+			register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+			tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1));
+			tmp_0 += base_0;
+			tmp_0 *= factor;
+			FIX
+			tmp_dbl *= frac10;
+			svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+			tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1));
+			register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+			tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)), 20), tmp_0);
+			tmp_0 += base_0;
+			tmp_0 *= factor;
+			FIX
+			tmp_dbl *= frac10;
+			svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+			/* ... lanes 2..36 of the 44-bit kernel follow the same pattern ... */
+			tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); +
register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; 
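+ // NOTE: `FIX`, `tmp_dbl`, `factor`, and `frac10` are not declared anywhere in
+ // this hunk; together with the uninitialized `base_0` and the bare
+ // `static_cast(0LL)` in each prologue, they read as placeholders that the FALP
+ // kernel generator substitutes when it emits a concrete kernel. Under that
+ // assumption, every unpacked lane group follows one fused pattern: mask/shift
+ // the packed bits out of `register_0`, add the frame-of-reference `base_0`,
+ // multiply by `factor` (10^fac), convert to double, scale by `frac10`
+ // (10^-exp), and store. A minimal sketch of what `FIX` presumably expands to,
+ // using the overloaded ACLE conversion intrinsic (illustrative only, not the
+ // generator's literal output):
+ //
+ //   svfloat64_t tmp_dbl = svcvt_f64_x(pg, tmp_0); // u64 lanes -> f64 lanes
+ //
+ // after which the generated `tmp_dbl *= frac10;` and `svst1(...)` statements
+ // apply the decimal scale and write the decoded doubles.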
+ FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, 
register_0, svdup_u64((1ULL << 16) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_45bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i 
* 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 
29) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 
672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_46bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + 
(16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 
*= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, 
svdup_u64((1ULL << 20) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + 
(0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_47bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,4), tmp_0); + 
tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 
= svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, 
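+ // Editorial note (assumption): `FIX` stands in for the lane-wise
+ // integer-to-double conversion that materializes `tmp_dbl` from `tmp_0`,
+ // most plausibly
+ //   svfloat64_t tmp_dbl = svcvt_f64_s64_x(pg, svreinterpret_s64_u64(tmp_0));
+ // (svcvt_f64_u64_x would also work if the unpacked digits are known to be
+ // non-negative). As committed, `FIX` and `tmp_dbl` do not compile.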
register_0, svdup_u64((1ULL << 11) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = 
svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_48bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), 
svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 
* 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 
+= base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), 
svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) 
+ (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_49bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
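+ // Editorial note: the stores are strided, not linear. Sub-value k of a
+ // lane-group is written to out + (i * 2) + (16 * k), so the 1024 doubles of
+ // a block land in the generator's interleaved "128crw" (128-bit compression
+ // register width) layout rather than in sequential order.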
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), 
svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + 
(0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + 
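+ // Editorial note: the recurring three-step pattern here reassembles a value
+ // that straddles two packed words: keep the high bits remaining in the
+ // current word (svlsr_x + mask), load the next packed word (svld1), then OR
+ // in its low bits shifted into position (svand_u64_x + svlsl_x + svorr).
+ // For this 49-bit kernel the shift amounts cycle through every phase, so 64
+ // values are rebuilt from 49 packed words per lane-group.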
register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_50bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), 
svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + 
(i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) 
- 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, 
register_0, svdup_u64((1ULL << 20) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_51bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,26), tmp_0); + 
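+ // Comment only (editorial): the shift constants follow mechanically from the
+ // packing layout. Value k of a <bw>-bit lane starts at bit (k * bw) % 64 of
+ // lane-word (k * bw) / 64, so a value that straddles a word boundary takes
+ // its low bits from svlsr on the current word and its high bits from svlsl
+ // on the next. For bw = 51, value 1 spans bits 51..63 of word 0 (13 bits)
+ // and bits 0..37 of word 1 (38 bits), which is exactly the pair of steps
+ // above:
+ //   tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1));
+ //   tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)), 13), tmp_0);
+ // The stride of 16 in every svld1 offset reflects the block layout: the 1024
+ // values are split into 16 lanes of 64 values, each lane packed into <bw>
+ // 64-bit words, and the lanes are interleaved word by word.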
tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 
= svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 
304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,45), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 
= svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 51) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_52bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + 
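+ // Comment only (editorial): every kernel in this file shares the same
+ // predicate-driven loop; a sketch, reconstructed from the surrounding code
+ // with a hypothetical UNPACK_FUSE_STORE standing in for the 64 unrolled
+ // steps:
+ //   svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), 1024LL / 64);
+ //   int64_t  i  = 0;
+ //   do {
+ //       UNPACK_FUSE_STORE(i, pg);           // 64 unrolled unpack+decode steps
+ //       i += svcntd();                      // advance by the number of 64-bit lanes
+ //       pg = svwhilelt_b64(i, 1024LL / 64); // stay active while i < 16
+ //   } while (svptest_any(svptrue_b64(), pg));
+ // On a 128-bit SVE implementation (the 128crw in the name, presumably the
+ // targeted compression-register width) svcntd() == 2, so the loop body runs
+ // eight times and each pass covers two of the 16 interleaved lanes.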
[[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + 
svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl 
*= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, 
svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_53bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 51) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0,
svdup_u64((1ULL << 40) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + 
(i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + 
tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, 
svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 51) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + 
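+ // NOTE: every stanza above and below is the same generated shift/mask unpack: value j of
+ // width bw starts at bit j*bw of its packed stream, so it is either masked straight out of
+ // the current 64-bit word (svand_u64_x of an svlsr_x) or, when it straddles a word boundary,
+ // assembled by OR-ing the high bits of the current word with the low bits of the next one
+ // (svorr_u64_x of an svlsl_x). The loads step by 16 uint64_t words because the 1024-value
+ // FastLanes block interleaves 16 such streams. An illustrative scalar equivalent for one
+ // non-interleaved stream (hypothetical helper, kept in a comment so the generated code is
+ // left untouched):
+ //
+ //   static inline uint64_t unpack_one(const uint64_t* packed, size_t j, unsigned bw) {
+ //       const uint64_t mask  = (bw == 64) ? ~0ULL : (1ULL << bw) - 1;
+ //       const size_t   word  = (j * bw) / 64;  // word holding the value's low bits
+ //       const unsigned shift = (j * bw) % 64;  // bit offset inside that word
+ //       uint64_t v = packed[word] >> shift;
+ //       if (shift + bw > 64)                   // value crosses into the next word
+ //           v |= packed[word + 1] << (64 - shift);
+ //       return v & mask;
+ //   }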
FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 53) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_54bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg,
svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; 
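+ // NOTE: the "tmp_0 += base_0; tmp_0 *= factor; FIX; tmp_dbl *= frac10;" tail of each stanza
+ // is the fused ALP decode of the unpacked lane. FIX, tmp_dbl, factor and frac10 are
+ // unresolved generator placeholders in this revision (base_0 is likewise never loaded from
+ // a_base_p, and the fac/exp parameters are unused): FIX marks where the integer-to-double
+ // conversion belongs. Assuming factor stands for 10^fac and frac10 for 10^-exp, one lane
+ // would compute the following (scalar sketch, hypothetical names, fac/exp < 6 for brevity):
+ //
+ //   static inline double falp_decode_one(uint64_t unpacked, uint64_t base,
+ //                                        uint8_t fac, uint8_t exp) {
+ //       static const int64_t FACT10[] = {1, 10, 100, 1000, 10000, 100000};   // 10^fac
+ //       static const double  FRAC10[] = {1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; // 10^-exp
+ //       const int64_t digits = static_cast<int64_t>(unpacked + base) * FACT10[fac];
+ //       return static_cast<double>(digits) * FRAC10[exp];
+ //   }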
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,38), tmp_0); + tmp_0 += base_0; + 
tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + 
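+ // NOTE: all falp_*bw kernels share this driver loop: svwhilelt_b64 builds a predicate for
+ // the lanes still below 1024/64, i advances by svcntd() (the number of 64-bit lanes one
+ // hardware vector holds), and the do/while stops once svptest_any sees no active lane, so
+ // the same source runs on any SVE vector length from 128 to 2048 bits. The pattern in
+ // isolation (a sketch; builds only with an SVE toolchain, e.g. -march=armv8-a+sve):
+ //
+ //   #include <arm_sve.h>
+ //   #include <cstdint>
+ //   static void copy_u64(const uint64_t* in, uint64_t* out, int64_t n) {
+ //       int64_t  i  = 0;
+ //       svbool_t pg = svwhilelt_b64(i, n);
+ //       do {
+ //           svst1(pg, out + i, svld1(pg, in + i)); // predicated load and store
+ //           i += svcntd();                         // step by the hardware lane count
+ //           pg = svwhilelt_b64(i, n);
+ //       } while (svptest_any(svptrue_b64(), pg));
+ //   }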
tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 54) - 1)); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL 
<< 26) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_55bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const 
auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,17), tmp_0); + tmp_0
+= base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 53) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, 
svdup_u64((1ULL << 12) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 51) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + 
(i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 51) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX 
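+ // NOTE: "FIX" is an unexpanded placeholder emitted by the kernel generator.
+ // From the surrounding statements it appears to stand for the per-lane
+ // u64 -> f64 conversion that declares and fills tmp_dbl from tmp_0 (an
+ // svcvt-style cast is assumed; the exact intrinsic is not given by this patch).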
+ tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 53) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); 
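+ // Every one of the 64 unrolled values in this loop body follows the same
+ // five-step FALP decode: shift/mask the packed bits out of register_0
+ // (OR-ing in the spill-over bits when a value straddles two 64-bit words),
+ // add the frame-of-reference base_0, multiply by the ALP factor, convert
+ // to double (the FIX step), then scale by frac10 and store with a
+ // predicated svst1. factor and frac10 are assumed to be derived from the
+ // fac/exp arguments earlier in this file; note that base_0 is declared
+ // [[maybe_unused]] but never loaded from a_base_p in the generated code
+ // shown here. The do/while is vector-length agnostic: i advances by
+ // svcntd() lanes per iteration and the loop exits once
+ // svwhilelt_b64(i, 1024 / 64) yields an all-false predicate.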
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 55) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_56bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *=
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), 
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 
32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 880); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_57bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg,
register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 51) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 55) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,55), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 
* 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 53) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= 
frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 53) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,31), tmp_0); + tmp_0 += 
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 55) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL 
<< 20) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 51) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 880); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 896); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 57) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_58bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 
1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, 
register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); 
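+ // NOTE: `FIX`, `tmp_dbl`, `factor`, and `frac10` are not declared anywhere in these kernels;
+ // they appear to be placeholders left by the FALP kernel generator for the decode tail of each
+ // step: convert the unpacked integer in `tmp_0` to a double (`tmp_dbl`) and apply the ALP
+ // scaling, presumably specialized from the `fac`/`exp` parameters. As emitted, this unit would
+ // not compile until the generator substitutes them.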
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + 
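+ // A value that straddles two packed 64-bit words is reassembled in two steps: its low bits are
+ // shifted down out of the current word (svlsr + mask), then the next word is loaded and its low
+ // bits are shifted up and OR'ed in (svlsl + svorr) to complete the value.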
register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 
44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 880); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 896); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 912); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 58) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_59bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + 
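+ // The enclosing do/while is vector-length agnostic: svwhilelt_b64(i, 1024/64) predicates over
+ // the 16 interleaved lanes of a 1024-value block, and i += svcntd() advances by however many
+ // 64-bit elements one SVE vector holds, so the same kernel serves any hardware vector length.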
register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 55) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,55), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 58) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 53) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 
28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 51) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 57) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = 
svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 57) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,57), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 
* 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 51) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 53) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + 
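+ // For a 59-bit width each lane consumes 59 packed 64-bit words per 64 decoded values, which is
+ // why the input offsets step by 16 (one word per interleaved lane) from 0 up to 16 * 58 = 928.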
tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 58) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 55) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = 
svld1(pg, in + (0 * 16) + (i) + 880); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 896); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 912); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 928); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 59) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_60bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + 
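+ // With a 60-bit width, gcd(60, 64) = 4, so the packed stream realigns to a word boundary every
+ // 16 values: stores 16, 32 and 48 below restart from a fresh svld1 plus a plain mask, with no
+ // carry bits from the previous word.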
tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; 
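+ // `base_0` is declared [[maybe_unused]] and added to every unpacked value, yet no load from
+ // `a_base_p` appears in these kernels; the generator presumably initializes it (e.g. a
+ // broadcast of the frame-of-reference base) in the substituted version. Under that assumption,
+ // and with hypothetical `factor`/`frac10` scalars derived from `fac`/`exp`, one decode step
+ // would compute, in scalar form:
+ //
+ //   uint64_t digits = unpacked + base;        // frame-of-reference restore
+ //   digits *= factor;                         // integer multiply, presumably 10^fac
+ //   double value =                            // int -> double conversion; the `FIX`
+ //       (double)(int64_t)digits * frac10;     // placeholder likely covers this, * 10^-exp
+ //   out[idx] = value;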
+ tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + 
(16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = 
svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,20), 
tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 880); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 896); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 912); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 928); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 944); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_61bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ // NOTE: FIX appears to be the code generator's placeholder for the int64 -> float64
+ // lane conversion of tmp_0 into tmp_dbl; factor and frac10 are evidently the ALP
+ // decoding constants derived from the fac and exp parameters.
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 61) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 58) - 1)) ,3), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 32);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 55) - 1)) ,6), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 48);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,9), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 64);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)) ,12), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 80);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,15), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 96);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,18), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 112);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,21), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 51) - 1)); + register_0 = svld1(pg, 
in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 57) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,57), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,60), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 61) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 59) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 53) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) 
- 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,44), tmp_0); + 
tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 53) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 59) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,59), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 61) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)) ,1), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 57) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, 
svdup_u64((1ULL << 51) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), 
tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 864);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,40), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 880);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,43), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 896);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,46), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 912);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,49), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 928);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,52), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 55) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 944);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,55), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 58) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 960);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,58), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 61) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_62bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 62) - 1));
+ tmp_0 +=
base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 58) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 
20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,38), 
tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, 
svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 58) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,60), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 62) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 62) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)) ,2), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 58) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 
16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX 
+ tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 880); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 896); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); 
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 912);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,52), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 928);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,54), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 944);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,56), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 58) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 960);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,58), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 976);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,60), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 62) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl);
+ i += svcntd();
+ pg = svwhilelt_b64(i, (1024LL / 64));
+ }
+ while (svptest_any(svptrue_b64(), pg));
+ }
+ static void falp_63bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = (a_out_p);
+ [[maybe_unused]] const auto in = (a_in_p);
+ [[maybe_unused]] svuint64_t register_0;
+ [[maybe_unused]] svuint64_t tmp_0;
+ svbool_t pg = svwhilelt_b64(static_cast<int64_t>(0LL), (1024LL / 64));
+ int64_t i = 0;
+ [[maybe_unused]] svuint64_t base_0;
+ do
+ {
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 0);
+ tmp_0 = svand_u64_x(pg, register_0, svdup_u64((1ULL << 63) - 1));
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 63), svdup_u64((1ULL << 1) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 16);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 62) - 1)) ,1), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), tmp_dbl);
+ tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 62), svdup_u64((1ULL << 2) - 1));
+ register_0 = svld1(pg, in + (0 * 16) + (i) + 32);
+ tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 61) - 1)) ,2), tmp_0);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ FIX
+ tmp_dbl *= frac10;
+ svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2),
tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 61), svdup_u64((1ULL << 3) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 60) - 1)) ,3), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 60), svdup_u64((1ULL << 4) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 59) - 1)) ,4), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 59), svdup_u64((1ULL << 5) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 58) - 1)) ,5), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 58), svdup_u64((1ULL << 6) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 57) - 1)) ,6), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 57), svdup_u64((1ULL << 7) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 56) - 1)) ,7), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 7), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 56), svdup_u64((1ULL << 8) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 55) - 1)) ,8), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 55), svdup_u64((1ULL << 9) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 54) - 1)) ,9), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 54), svdup_u64((1ULL << 10) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 53) - 1)) ,10), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 53), svdup_u64((1ULL << 11) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 52) - 1)) ,11), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 52), svdup_u64((1ULL << 12) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, 
svand_u64_x(pg, register_0, svdup_u64((1ULL << 51) - 1)) ,12), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 51), svdup_u64((1ULL << 13) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 50) - 1)) ,13), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 50), svdup_u64((1ULL << 14) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 49) - 1)) ,14), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 49), svdup_u64((1ULL << 15) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 48) - 1)) ,15), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 48), svdup_u64((1ULL << 16) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 47) - 1)) ,16), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 47), svdup_u64((1ULL << 17) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 46) - 1)) ,17), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 46), svdup_u64((1ULL << 18) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 45) - 1)) ,18), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 45), svdup_u64((1ULL << 19) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 44) - 1)) ,19), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 44), svdup_u64((1ULL << 20) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 43) - 1)) ,20), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 43), svdup_u64((1ULL << 21) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 42) - 1)) ,21), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 
16 * 2) + (16 * 21), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 42), svdup_u64((1ULL << 22) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 41) - 1)) ,22), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 41), svdup_u64((1ULL << 23) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 40) - 1)) ,23), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 40), svdup_u64((1ULL << 24) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 39) - 1)) ,24), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 39), svdup_u64((1ULL << 25) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 38) - 1)) ,25), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 25), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 38), svdup_u64((1ULL << 26) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 37) - 1)) ,26), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 37), svdup_u64((1ULL << 27) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 36) - 1)) ,27), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 36), svdup_u64((1ULL << 28) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 35) - 1)) ,28), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 35), svdup_u64((1ULL << 29) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 34) - 1)) ,29), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 34), svdup_u64((1ULL << 30) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 33) - 1)) ,30), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 33), svdup_u64((1ULL << 31) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + 
tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 32) - 1)) ,31), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 32), svdup_u64((1ULL << 32) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 31) - 1)) ,32), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 31), svdup_u64((1ULL << 33) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 30) - 1)) ,33), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 30), svdup_u64((1ULL << 34) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 29) - 1)) ,34), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 29), svdup_u64((1ULL << 35) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 28) - 1)) ,35), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 28), svdup_u64((1ULL << 36) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 27) - 1)) ,36), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 27), svdup_u64((1ULL << 37) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 26) - 1)) ,37), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 26), svdup_u64((1ULL << 38) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 25) - 1)) ,38), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 25), svdup_u64((1ULL << 39) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 24) - 1)) ,39), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 24), svdup_u64((1ULL << 40) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 23) - 1)) ,40), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= 
frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 23), svdup_u64((1ULL << 41) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 22) - 1)) ,41), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 22), svdup_u64((1ULL << 42) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 21) - 1)) ,42), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 21), svdup_u64((1ULL << 43) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 20) - 1)) ,43), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 43), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 20), svdup_u64((1ULL << 44) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 19) - 1)) ,44), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 19), svdup_u64((1ULL << 45) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 18) - 1)) ,45), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 18), svdup_u64((1ULL << 46) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 17) - 1)) ,46), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 17), svdup_u64((1ULL << 47) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 16) - 1)) ,47), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 16), svdup_u64((1ULL << 48) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 15) - 1)) ,48), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 15), svdup_u64((1ULL << 49) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 14) - 1)) ,49), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 14), svdup_u64((1ULL << 50) - 1)); + register_0 
= svld1(pg, in + (0 * 16) + (i) + 800); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 13) - 1)) ,50), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 13), svdup_u64((1ULL << 51) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 12) - 1)) ,51), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 12), svdup_u64((1ULL << 52) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 11) - 1)) ,52), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 11), svdup_u64((1ULL << 53) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 10) - 1)) ,53), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 10), svdup_u64((1ULL << 54) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 9) - 1)) ,54), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 9), svdup_u64((1ULL << 55) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 880); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 8) - 1)) ,55), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 8), svdup_u64((1ULL << 56) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 896); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 7) - 1)) ,56), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 7), svdup_u64((1ULL << 57) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 912); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 6) - 1)) ,57), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 6), svdup_u64((1ULL << 58) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 928); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 5) - 1)) ,58), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 5), svdup_u64((1ULL << 59) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 944); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 4) - 1)) ,59), tmp_0); + tmp_0 += base_0; + tmp_0 *= 
factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 4), svdup_u64((1ULL << 60) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 960); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 3) - 1)) ,60), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 3), svdup_u64((1ULL << 61) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 976); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 2) - 1)) ,61), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 61), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 2), svdup_u64((1ULL << 62) - 1)); + register_0 = svld1(pg, in + (0 * 16) + (i) + 992); + tmp_0 = svorr_u64_x(pg, svlsl_x(pg, svand_u64_x(pg, register_0, svdup_u64((1ULL << 1) - 1)) ,62), tmp_0); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), tmp_dbl); + tmp_0 = svand_u64_x(pg, svlsr_x(pg, register_0, 1), svdup_u64((1ULL << 63) - 1)); + tmp_0 += base_0; + tmp_0 *= factor; + FIX + tmp_dbl *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), tmp_dbl); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + static void falp_64bw_64ow_128crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = (a_out_p); + [[maybe_unused]] const auto in = (a_in_p); + [[maybe_unused]] svuint64_t register_0; + [[maybe_unused]] svuint64_t tmp_0; + svbool_t pg = svwhilelt_b64(static_cast(0LL), (1024LL / 64)); + int64_t i = 0; + [[maybe_unused]] svuint64_t base_0; + do + { + register_0 = svld1(pg, in + (0 * 16) + (i) + 0); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 0), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 16); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 1), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 32); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 2), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 48); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 3), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 64); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 4), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 80); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 5), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 96); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 6), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 112); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 
7), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 128); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 8), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 144); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 9), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 160); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 10), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 176); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 11), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 192); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 12), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 208); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 13), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 224); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 14), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 240); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 15), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 256); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 16), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 272); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 17), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 288); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 18), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 304); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 19), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 320); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 20), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 336); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 21), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 352); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 22), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 368); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 23), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 384); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 24), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 400); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + 
(16 * 25), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 416); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 26), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 432); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 27), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 448); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 28), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 464); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 29), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 480); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 30), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 496); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 31), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 512); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 32), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 528); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 33), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 544); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 34), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 560); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 35), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 576); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 36), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 592); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 37), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 608); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 38), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 624); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 39), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 640); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 40), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 656); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 41), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 672); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 42), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 688); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 
16 * 2) + (16 * 43), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 704); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 44), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 720); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 45), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 736); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 46), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 752); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 47), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 768); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 48), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 784); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 49), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 800); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 50), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 816); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 51), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 832); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 52), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 848); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 53), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 864); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 54), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 880); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 55), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 896); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 56), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 912); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 57), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 928); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 58), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 944); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 59), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 960); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 60), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 976); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 
2) + (0 * 16 * 2) + (16 * 61), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 992); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 62), register_0); + register_0 = svld1(pg, in + (0 * 16) + (i) + 1008); + register_0 += base_0; + register_0 *= factor; + FIX + register_0 *= frac10; + svst1(pg, out + (i * 2) + (0 * 16 * 2) + (16 * 63), register_0); + i += svcntd(); + pg = svwhilelt_b64(i, (1024LL / 64)); + } + while (svptest_any(svptrue_b64(), pg)); + } + void falp(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, uint8_t bw, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + switch (bw) + { + case 0: + falp_0bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 1: + falp_1bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 2: + falp_2bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 3: + falp_3bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 4: + falp_4bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 5: + falp_5bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 6: + falp_6bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 7: + falp_7bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 8: + falp_8bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 9: + falp_9bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 10: + falp_10bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 11: + falp_11bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 12: + falp_12bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 13: + falp_13bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 14: + falp_14bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 15: + falp_15bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 16: + falp_16bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 17: + falp_17bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 18: + falp_18bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 19: + falp_19bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 20: + falp_20bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 21: + falp_21bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 22: + falp_22bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 23: + falp_23bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 24: + falp_24bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 25: + falp_25bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 26: + falp_26bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 27: + falp_27bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 28: + falp_28bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 29: + falp_29bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 30: + falp_30bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 31: + falp_31bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 32: + falp_32bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 33: + 
falp_33bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 34: + falp_34bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 35: + falp_35bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 36: + falp_36bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 37: + falp_37bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 38: + falp_38bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 39: + falp_39bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 40: + falp_40bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 41: + falp_41bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 42: + falp_42bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 43: + falp_43bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 44: + falp_44bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 45: + falp_45bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 46: + falp_46bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 47: + falp_47bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 48: + falp_48bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 49: + falp_49bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 50: + falp_50bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 51: + falp_51bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 52: + falp_52bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 53: + falp_53bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 54: + falp_54bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 55: + falp_55bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 56: + falp_56bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 57: + falp_57bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 58: + falp_58bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 59: + falp_59bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 60: + falp_60bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 61: + falp_61bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 62: + falp_62bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 63: + falp_63bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 64: + falp_64bw_64ow_128crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + } + } + } + } +} +; diff --git a/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_test.cpp b/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_test.cpp new file mode 100644 index 0000000..132397a --- /dev/null +++ b/generated/arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_test.cpp @@ -0,0 +1,119 @@ +#include "alp/alp.hpp" +#include "datasets.hpp" +#include "alp/ffor.hpp" +#include "alp/unffor.hpp" +#include "gtest/gtest.h" +class arm64v8_sve_intrinsic_1024_uf1_falp: public ::testing::Test +{ + public: + double * dbl_arr; + double * exc_arr; + uint16_t * pos_arr; + uint16_t * exc_c_arr; + int64_t * ffor_arr; + int64_t * unffor_arr; + int64_t * base_arr; + int64_t * dig_arr; + double * dec_dbl_arr; + uint8_t bw; + uint8_t factor; + uint8_t 
exponent;
+	double* smp_arr;
+	void SetUp() override
+	{
+		dbl_arr     = new double[1024];
+		exc_arr     = new double[1024];
+		pos_arr     = new uint16_t[1024];
+		dig_arr     = new int64_t[1024];
+		dec_dbl_arr = new double[1024];
+		exc_c_arr   = new uint16_t[1024];
+		ffor_arr    = new int64_t[1024];
+		unffor_arr  = new int64_t[1024];
+		base_arr    = new int64_t[1024];
+		smp_arr     = new double[ALP_VECTOR_SIZE];
+	}
+	~arm64v8_sve_intrinsic_1024_uf1_falp() override
+	{
+		delete[] dbl_arr;
+		delete[] exc_arr;
+		delete[] pos_arr;
+		delete[] dig_arr;
+		delete[] dec_dbl_arr;
+		delete[] exc_c_arr;
+		delete[] ffor_arr;
+		delete[] unffor_arr;
+		delete[] base_arr;
+		delete[] smp_arr;
+	}
+};
+TEST_F(arm64v8_sve_intrinsic_1024_uf1_falp, fused)
+{
+	for (auto& dataset : alp_bench::datasets)
+	{
+		std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+		ASSERT_EQ(ifile.fail(), false);
+		alp::state stt;
+		if (dataset.suitable_for_cutting) { continue; }
+		if (dataset.name.find("bw") != std::string::npos) { continue; }
+		double num = 0.0;
+		size_t c {0};
+		while (ifile >> num)
+		{
+			dbl_arr[c] = num;
+			c = c + 1;
+		}
+		// Init
+		alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+		// Encode
+		alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+		alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+		alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+		// Decode
+		generated::falp::arm64v8::sve::falp(
+		    reinterpret_cast<uint64_t*>(ffor_arr), dec_dbl_arr, bw, reinterpret_cast<uint64_t*>(base_arr), stt.fac, stt.exp);
+		alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+		for (size_t i = 0; i < 1024; ++i)
+		{
+			ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+		}
+		ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+		ASSERT_EQ(dataset.bit_width, bw);
+		ifile.close();
+	}
+}
+
+TEST_F(arm64v8_sve_intrinsic_1024_uf1_falp, unfused)
+{
+	for (auto& dataset : alp_bench::datasets)
+	{
+		std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+		ASSERT_EQ(ifile.fail(), false);
+		alp::state stt;
+		if (dataset.suitable_for_cutting) { continue; }
+		if (dataset.name.find("bw") != std::string::npos) { continue; }
+		double num = 0.0;
+		size_t c {0};
+		while (ifile >> num)
+		{
+			dbl_arr[c] = num;
+			c = c + 1;
+		}
+		// Init
+		alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+		// Encode
+		alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+		alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+		alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+		// Decode
+		alp::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
+		alp::AlpDecode(reinterpret_cast<uint64_t*>(unffor_arr), stt.fac, stt.exp, dec_dbl_arr);
+		alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+		for (size_t i = 0; i < 1024; ++i)
+		{
+			ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+		}
+		ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+		ASSERT_EQ(dataset.bit_width, bw);
+		ifile.close();
+	}
+}
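+// Both tests require bit-exact doubles after decoding and exception patching: "fused" drives the
+// generated SVE FALP kernel directly, while "unfused" runs scalar unFFOR followed by ALP decode.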
message(STATUS "The flag ${FLAG} is not supported by the current compiler") +endif() +target_compile_options(arm64v8_sve_intrinsic_1024_uf1_falp PUBLIC ${FLAG}) +cmake_print_properties(TARGETS arm64v8_sve_intrinsic_1024_uf1_falp + PROPERTIES COMPILE_DEFINITIONS + PROPERTIES COMPILE_OPTIONS) +LIST (APPEND ALP_GENERATED_OBJECT_FILES + $) +get_target_property(TARGET_NAME arm64v8_sve_intrinsic_1024_uf1_falp NAME) +get_target_property(TARGET_COMPILE_OPTIONS arm64v8_sve_intrinsic_1024_uf1_falp COMPILE_OPTIONS) +#------------------------------------------------------------------------------------------------------ +add_executable(arm64v8_sve_intrinsic_1024_uf1_falp_test arm64v8_sve_intrinsic_1024_uf1_falp_test.cpp) +target_link_libraries(arm64v8_sve_intrinsic_1024_uf1_falp_test PRIVATE arm64v8_sve_intrinsic_1024_uf1_falp) +target_link_libraries(arm64v8_sve_intrinsic_1024_uf1_falp_test PRIVATE alp_ffor) +target_link_libraries(arm64v8_sve_intrinsic_1024_uf1_falp_test PRIVATE gtest_main) +target_include_directories(arm64v8_sve_intrinsic_1024_uf1_falp_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +gtest_discover_tests(arm64v8_sve_intrinsic_1024_uf1_falp_test) +#------------------------------------------------------------------------------------------------------ +configure_file(${CMAKE_SOURCE_DIR}/alp_bench/alp_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/arm64v8_sve_intrinsic_1024_uf1_falp_bench.hpp) +add_executable(arm64v8_sve_intrinsic_1024_uf1_falp_bench arm64v8_sve_intrinsic_1024_uf1_falp_bench.cpp) +target_link_libraries(arm64v8_sve_intrinsic_1024_uf1_falp_bench PRIVATE arm64v8_sve_intrinsic_1024_uf1_falp) +target_link_libraries(arm64v8_sve_intrinsic_1024_uf1_falp_bench PRIVATE alp_ffor) +target_include_directories(arm64v8_sve_intrinsic_1024_uf1_falp_bench PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +add_alp_benchmark(arm64v8_sve_intrinsic_1024_uf1_falp_bench) diff --git a/generated/fallback/CMakeLists.txt b/generated/fallback/CMakeLists.txt new file mode 100644 index 0000000..3955135 --- /dev/null +++ b/generated/fallback/CMakeLists.txt @@ -0,0 +1,11 @@ +#add_subdirectory(scalar_av_uf1) +add_subdirectory(scalar_aav_uf1) +#add_subdirectory(scalar_nav_uf1) + +add_library(generated_fallback + OBJECT + fallback.cpp) + +set(ALP_GENERATED_OBJECT_FILES + ${ALP_GENERATED_OBJECT_FILES} $ + PARENT_SCOPE) diff --git a/generated/fallback/fallback.cpp b/generated/fallback/fallback.cpp new file mode 100644 index 0000000..e69de29 diff --git a/generated/fallback/scalar_aav_uf1/CMakeLists.txt b/generated/fallback/scalar_aav_uf1/CMakeLists.txt new file mode 100644 index 0000000..2b9ff8a --- /dev/null +++ b/generated/fallback/scalar_aav_uf1/CMakeLists.txt @@ -0,0 +1,50 @@ +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake) +else() +endif() +if(EXISTS 
"${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake) +else() +endif() +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake") + include(${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake) +else() +endif() +set(FLS_GENERATED_OBJECT_FILES + ${FLS_GENERATED_OBJECT_FILES} PARENT_SCOPE) diff --git a/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_src.cpp b/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_src.cpp new file mode 100644 index 0000000..ce890af --- /dev/null +++ b/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_src.cpp @@ -0,0 +1,33954 @@ +#include "alp/alp.hpp" +#include "fastlanes/macros.hpp" +namespace generated { namespace falp::fallback { namespace scalar { +static void falp_0bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + *(out + (i * 1) + (0 * 16) + (16 * 0)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = 
+static void falp_1bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                    double* __restrict a_out_p,
+                                    const uint64_t* __restrict a_base_p,
+                                    uint8_t fac,
+                                    uint8_t exp) {
+	[[maybe_unused]] auto out      = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
+	[[maybe_unused]] int64_t factor  = alp::FACT_ARR[fac];
+	[[maybe_unused]] double frac10   = alp::Constants::FRAC_ARR[exp];
+	[[maybe_unused]] double tmp_dbl;
+	[[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+	for (int i = 0; i < 16; ++i) {
+		register_0 = *(in + (0 * 16) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+		tmp_0 = (register_0 >> 1) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+		tmp_0 = (register_0 >> 2) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+		tmp_0 = (register_0 >> 3) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+		tmp_0 = (register_0 >> 4) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+		tmp_0 = (register_0 >> 5) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+
tmp_0 = (register_0 >> 6) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 1) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 59) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 61) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 62) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_2bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16)
+ (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int 
= tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int 
= tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_3bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 3) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 15) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 21) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+ tmp_0 = (register_0 >> 27) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl;
+ tmp_0 = (register_0 >> 33) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
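+ // The unsigned bit-unpacked result is reinterpreted as int64_t here so that
+ // encodings of negative values convert correctly to double before the final
+ // frac10 rescale.
+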
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + tmp_0 += base_0; 
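+ // base_0 is the frame-of-reference offset removed during encoding; the
+ // multiply by factor below then restores the original decimal digits in the
+ // integer domain before any floating-point arithmetic happens.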
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_4bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *=
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 
1) + 48);
+ tmp_0 = (register_0) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_5bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]]
uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 
11) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 5) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 5) - 1); + tmp_0 += base_0; 
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl;
+ tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 19) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 39) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 49) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_6bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+
[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) 
= tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = 
tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl;
+ tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_7bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
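+ // Each falp_*bw kernel fuses bit-unpacking with ALP decoding; per value it
+ // repeats the same seven-step pattern. A scalar sketch of one step ('bw' and
+ // 'offset' are illustrative names, not variables in this file):
+ //
+ //   uint64_t digits = (word >> offset) & ((1ULL << bw) - 1); // unpack bw bits
+ //   digits += base_0;                  // undo the frame-of-reference base
+ //   digits *= factor;                  // FACT_ARR[fac], presumably 10^fac
+ //   double decoded = (double)(int64_t)digits * frac10; // FRAC_ARR[exp], presumably 10^-exp
+ //
+ // Values that straddle a 64-bit word boundary are stitched together from two
+ // words with an extra shift/OR, as in the unrolled bodies above and below.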
+#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 
7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 49)) = tmp_dbl;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 51) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 15) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 43) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 50) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_8bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
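+ // tmp_int and tmp_dbl are per-step scalar temporaries; the '#pragma clang
+ // loop vectorize(enable)' below asks Clang to auto-vectorize the
+ // 16-iteration loop across the interleaved lanes.
+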
[[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl 
*= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 50)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 112);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_9bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
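+ // tmp_dbl now holds the rescaled ALP integer as a double; the multiply by
+ // frac10 (FRAC_ARR[exp], presumably 10^-exp) restores the decimal magnitude.
+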
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 9) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 
33)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 
= (register_0 >> 2) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl;
+ tmp_0 = (register_0 >> 11) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 38) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 47) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 128);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 19) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_10bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang
loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 
>> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = 
tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_11bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); +
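// Decode pattern per value: bit-unpack (shift and mask), add the frame-of-reference base_0, multiply by the integer factor FACT_ARR[fac], then cast through int64_t and double and scale by frac10 = FRAC_ARR[exp] to restore the original double. +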
[[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int 
= tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl 
= tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 
53) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_12bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >>
16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = 
(register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_13bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; +
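// Values that straddle two 64-bit words are stitched from both: the low bits come from the current word and the remaining high bits from the next one, e.g. tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11; above completes an 11 + 2 = 13-bit value. +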
tmp_0 = (register_0 >> 2) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = 
(register_0 >> 18) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = 
(register_0 >> 34) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = 
(register_0 >> 50) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_14bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; +
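// Layout: each call decodes one block of 1024 doubles as 16 interleaved lanes (loop index i) of 64 values; word w of lane i is read from in[i + 16 * w] and value k of lane i is written to out[i + 16 * k]. +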
tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
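+ // Annotation: the 1024 values of one ALP vector are laid out in 16 interleaved
+ // lanes. Lane i reads its k-th packed word at in[i + 16*k] and writes its j-th
+ // decoded value to out[i + 16*j], so each kernel call emits 64 values per lane.
+ // The 16-iteration i-loop is the dimension the `#pragma clang loop
+ // vectorize(enable)` hint asks the compiler to turn into SIMD.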
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + 
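+ // Annotation: bw = 14 does not divide 64, so some values straddle two input
+ // words. The recurring pattern above handles this: the bits remaining at the
+ // top of the current word supply the value's low bits, the next word is loaded
+ // into register_0, and its low bits are OR-ed in, shifted left by the number of
+ // bits already consumed.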
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_15bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i 
* 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 
34)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) 
= tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_16bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] 
int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = 
(register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
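+ // Annotation: 16 divides 64, so in this kernel every input word holds exactly
+ // four values and no cross-word merge is ever needed -- the unpack reduces to
+ // fixed shifts of 0, 16, 32 and 48 on each freshly loaded word.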
*(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_17bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = 
alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 3; + 
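+ // Annotation -- a scalar reference for decoding one value, equivalent to each
+ // unrolled block in these kernels (kept as a comment so the generated code is
+ // unchanged; the helper name is illustrative only):
+ //   static double falp_decode_one(uint64_t masked_bits, uint64_t base,
+ //                                 uint8_t fac, uint8_t exp) {
+ //       uint64_t tmp = masked_bits;                        // bw-bit payload, already masked
+ //       tmp += base;                                       // undo frame-of-reference
+ //       tmp *= static_cast<uint64_t>(alp::FACT_ARR[fac]);  // undo 10^fac (wrapping, as above)
+ //       const auto digits = static_cast<int64_t>(tmp);     // reinterpret as signed digits
+ //       return static_cast<double>(digits) * alp::Constants::FRAC_ARR[exp]; // apply 10^-exp
+ //   }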
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_18bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
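+ // Annotation: a bw-bit kernel reads 16*bw input words per 1024-value vector
+ // (1024*bw bits / 64 bits per word). For bw = 18 the per-lane word offsets
+ // therefore run from +0 up to +272 in steps of 16, i.e. 18 words per lane.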
*(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 
>> 48) & ((1ULL << 16) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 256);
+		tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+		tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+		tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+		tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+		tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 272);
+		tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+		tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+		tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+		tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+	}
+}
+static void falp_19bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+	[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
+	[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+	[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+	[[maybe_unused]] double tmp_dbl;
+	[[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+	for (int i = 0; i < 16; ++i) {
+		register_0 = *(in + (0 * 16) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+		tmp_0 = (register_0 >> 19) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+		tmp_0 = (register_0 >> 38) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+		tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 16);
+		tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 7;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+		tmp_0 = (register_0 >> 12) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
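+		// Cross-word reassembly, as used for value 3 just above (illustrative note,
+		// not generator output): a 19-bit value that straddles two 64-bit words is
+		// rebuilt from its low bits in the current word and its high bits in the next:
+		//   tmp_0  = (w0 >> 57) & ((1ULL << 7) - 1);    // low 7 bits
+		//   tmp_0 |= (w1 & ((1ULL << 12) - 1)) << 7;    // high 12 bits
+		// which is exactly the load/OR pair emitted before each boundary store.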
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 
>> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 
25) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & 
((1ULL << 9) - 1)) << 10;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl;
+		tmp_0 = (register_0 >> 9) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+		tmp_0 = (register_0 >> 28) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+		tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 256);
+		tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 17;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+		tmp_0 = (register_0 >> 2) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+		tmp_0 = (register_0 >> 21) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+		tmp_0 = (register_0 >> 40) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+		tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 272);
+		tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 5;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+		tmp_0 = (register_0 >> 14) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+		tmp_0 = (register_0 >> 33) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+		tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 288);
+		tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 12;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+		tmp_0 = (register_0 >> 7) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+		tmp_0 = (register_0 >> 26) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+		tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+	}
+}
+static void falp_20bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+	[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
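+	// Layout note (inferred from the indexing; an assumption, not stated in the
+	// source): each call decodes 16 interleaved lanes of 64 values (1024 doubles);
+	// lane i reads its w-th packed word at in[i + 16 * w] and writes value j to
+	// out[i + 16 * j], so the vectorize-enabled loop below reduces to column-wise
+	// SIMD over the 16 lanes.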
[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = 
(register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + 
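+		// For bw = 20 the packing repeats every 16 values (16 * 20 = 320 bits = 5
+		// words), so values 16, 32 and 48 start at fresh word boundaries; hence the
+		// plain reloads at offsets 80, 160 and 240 with no cross-word OR.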
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
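+		// Assumption mirroring scalar ALP decode: for a valid encoding the product
+		// (digits + base) * FACT_ARR[fac] fits in int64_t, so the uint64_t multiply
+		// above never wraps meaningfully and the int64_t -> double conversion here
+		// round-trips the originally encoded value.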
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) 
+ 304);
+		tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+		tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+		tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+		tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+	}
+}
+static void falp_21bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+	[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
+	[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+	[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+	[[maybe_unused]] double tmp_dbl;
+	[[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+	for (int i = 0; i < 16; ++i) {
+		register_0 = *(in + (0 * 16) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+		tmp_0 = (register_0 >> 21) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+		tmp_0 = (register_0 >> 42) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+		tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 16);
+		tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 1;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+		tmp_0 = (register_0 >> 20) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+		tmp_0 = (register_0 >> 41) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+		tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 32);
+		tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 2;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+		tmp_0 = (register_0 >> 19) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+		tmp_0 = (register_0 >> 40) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 *
8)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = 
tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = 
tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = 
tmp_dbl;
+		tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 288);
+		tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 18;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+		tmp_0 = (register_0 >> 3) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+		tmp_0 = (register_0 >> 24) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+		tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 304);
+		tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 19;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+		tmp_0 = (register_0 >> 2) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+		tmp_0 = (register_0 >> 23) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+		tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 320);
+		tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 20;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+		tmp_0 = (register_0 >> 1) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+		tmp_0 = (register_0 >> 22) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+		tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+	}
+}
+static void falp_22bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+	[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
+	[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+	[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+	[[maybe_unused]] double tmp_dbl;
+	[[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+	for (int i = 0; i < 16; ++i) {
+		register_0 = *(in + (0 * 16) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 22) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+		tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+		tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+		register_0 = *(in + (0 *
16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + 
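+ // Decode chain applied to every unpacked value: add the frame-of-reference
+ // base, multiply by factor (alp::FACT_ARR[fac], a power of ten in the ALP
+ // scheme), reinterpret the 64-bit product as a signed integer, widen it to
+ // double, and scale by frac10 (alp::Constants::FRAC_ARR[exp], the matching
+ // inverse power of ten) to reconstruct the original double.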
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_23bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10;
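+ // For orientation, a minimal non-unrolled sketch of what each of these
+ // generated kernels computes (BW is a stand-in for the constant bit width
+ // baked into each function; the unrolled code is the authoritative form):
+ //
+ //   for (int i = 0; i < 16; ++i) {       // 16 interleaved lanes
+ //     for (int v = 0; v < 64; ++v) {     // 64 values per lane
+ //       uint64_t digits = /* next BW bits of lane i, read from
+ //                            in[16 * w + i], spliced across words */;
+ //       out[16 * v + i] = double(int64_t((digits + base_0) * factor)) * frac10;
+ //     }
+ //   }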
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 1; + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 23) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + 
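+ // This value straddles a 64-bit word boundary: its 15 low bits came from
+ // bits 49..63 of the current word, and the remaining 8 bits are spliced in
+ // from the next input word by the masked OR-shift below.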
register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_24bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 
24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = 
tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = 
(register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_25bw_64ow_64crw_1uf(const uint64_t* __restrict 
a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 25) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 
* 16) + (i * 1) + 352);
+ tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 8;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 25) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 368);
+ tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 22;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 3) & ((1ULL << 25) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 25) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 384);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 11;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 25) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_26bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 26) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); +
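+ // [editor's note, not in the original patch] Every unrolled step in these
+ // generated FALP kernels follows one fused pattern: unpack a bw-bit value,
+ // add the frame-of-reference base, multiply by factor (alp::FACT_ARR[fac],
+ // presumably 10^fac), round-trip through int64_t, then scale by frac10
+ // (alp::Constants::FRAC_ARR[exp], presumably 10^-exp). A scalar sketch of
+ // one step for lane i, value j, with a hypothetical unpack_bits helper:
+ //   uint64_t v = unpack_bits(in, i, j, 26); // bits [26*j, 26*(j+1)) of lane i
+ //   v += base_0;                            // undo frame-of-reference
+ //   v *= factor;                            // restore decimal digits
+ //   *(out + i + 16 * j) = static_cast<double>(static_cast<int64_t>(v)) * frac10;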
register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = 
(register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
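+ // [editor's note] When a packed value straddles a 64-bit word boundary, the
+ // generator emits a two-part read, as in the steps just above: the low
+ // 64 - s bits come from the current word (register_0 >> s), and the missing
+ // high bits are masked out of the freshly loaded next word and OR-ed in at
+ // the right position, so the two mask widths always sum to the bit width
+ // (14 + 12 = 26 here).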
tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_27bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 27) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 10;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 20;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 3;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+ tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 13;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int =
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + 
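+ // [editor's note] The index arithmetic encodes the interleaved block layout:
+ // word k of lane i sits at in + i + 16 * k, and decoded value j of lane i is
+ // written to out + i + 16 * j, i.e. 16 lanes x 64 values = 1024 doubles per
+ // block. The outer loop over the 16 lanes is what the preceding
+ // "#pragma clang loop vectorize(enable)" asks clang to turn into SIMD code.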
tmp_0 = (register_0 >> 35) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + 
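+ // [editor's note] Worked offset for the step that follows: at bw = 27, value
+ // 54 starts at bit 27 * 54 = 1458 = 22 * 64 + 50, i.e. bit 50 of word 22.
+ // Only 14 bits remain in that word, so the other 13 bits come from word 23
+ // (the load at offset 368 = 23 * 16 below), matching the 14/13 mask pair.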
tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 368);
+ tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 14;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 13) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 384);
+ tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 24;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 3) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 400);
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 7;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 416);
+ tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 17;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_28bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 28) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 *
1)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= 
((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 432);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_29bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 6;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 23) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 12;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 18;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 11) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out +
(i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 29) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 22; + tmp_0 
+= base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 384);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 28;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 400);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 5;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 416);
+ tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 11;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 432);
+ tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 17;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 448);
+ tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 23;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+// FALP kernel: unpacks one vector of 1024 30-bit values (16 interleaved lanes
+// x 64 values per lane) and applies the ALP decode inline on each value.
+static void falp_30bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24;
+
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 
= (register_0 >> 14) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= 
((register_0) & ((1ULL << 16) - 1)) << 14;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 432);
+ tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 448);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 464);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+// FALP kernel: unpacks one vector of 1024 31-bit values (16 interleaved lanes
+// x 64 values per lane) and applies the ALP decode inline on each value.
+static void falp_31bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 31) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 31) & ((1ULL << 31) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 2;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) =
tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; 
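+ // Fused ALP decode applied to every unpacked value: add the
+ // frame-of-reference base (base_0), multiply by the decoding factor in
+ // wrapping uint64_t arithmetic, pass the result through int64_t so it is
+ // reinterpreted as signed, then scale by frac10 to materialize the double.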
+ tmp_0 = (register_0 >> 15) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; 
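+ // Values that straddle two 64-bit words are assembled in two steps: the
+ // bits remaining in the current word are masked out, the next word is
+ // loaded into register_0, and its low bits are OR-ed in at the matching
+ // shift before the same decode sequence runs.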
+ tmp_0 = (register_0 >> 1) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + 
tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
464);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 27;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 31) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 480);
+ tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 29;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 2) & ((1ULL << 31) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+// FALP kernel: unpacks one vector of 1024 32-bit values (16 interleaved lanes
+// x 64 values per lane) and applies the ALP decode inline on each value.
+static void falp_32bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i
* 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + 
tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 448);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 464);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 480);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ register_0 = *(in + (0 * 16) + (i * 1) + 496);
+ tmp_0 = (register_0) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+// FALP kernel: unpacks one vector of 1024 33-bit values (16 interleaved lanes
+// x 64 values per lane) and applies the ALP decode inline on each value.
+static void falp_33bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 33) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 31;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 2) & ((1ULL << 33) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 29;
+ tmp_0 += base_0;
+
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 15; + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 1; + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 33) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+		tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 496);
+		tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 4;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+		tmp_0 = (register_0 >> 29) & ((1ULL << 33) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+		tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 512);
+		tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 2;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+		tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+	}
+}
+static void falp_34bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+	[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
+	[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+	[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+	[[maybe_unused]] double tmp_dbl;
+	[[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+	for (int i = 0; i < 16; ++i) {
+		register_0 = *(in + (0 * 16) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 34) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+		tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 16);
+		tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+		tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+		tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 32);
+		tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+		tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+		tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 48);
+		tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+		tmp_0 = (register_0 >> 12) & ((1ULL << 34) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i *
1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= 
((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 
|= ((register_0) & ((1ULL << 30) - 1)) << 4;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+		tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+	}
+}
+static void falp_35bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+	[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
+	[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+	[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+	[[maybe_unused]] double tmp_dbl;
+	[[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+	for (int i = 0; i < 16; ++i) {
+		register_0 = *(in + (0 * 16) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+		tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 16);
+		tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 29;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+		tmp_0 = (register_0 >> 6) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+		tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 32);
+		tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 23;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+		tmp_0 = (register_0 >> 12) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+		tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 48);
+		tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 17;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+		tmp_0 = (register_0 >> 18) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+		tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 64);
+		tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 11;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+		tmp_0 = (register_0 >> 24) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+		tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 80);
+		tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 5;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int =
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + 
tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; 
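+		// NOTE (editor, not in the generated source): the group above is the
+		// generic two-word "straddle" case of this 35-bit kernel. A value whose
+		// bits span two packed words is reassembled from both, e.g. for a value
+		// starting at bit 44 of the current word:
+		//   tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);    // low 20 bits
+		//   register_0 = *(in + ...);                           // next packed word
+		//   tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 20; // high 15 bits
+		// The fixed tail (+ base_0, * factor, int64 -> double, * frac10) is the
+		// ALP decoding applied to every reassembled 35-bit value.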
+ tmp_0 = (register_0 >> 15) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = 
tmp_dbl;
+		tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 448);
+		tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 7;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl;
+		tmp_0 = (register_0 >> 28) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+		tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 464);
+		tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 1;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+		tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 480);
+		tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 30;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+		tmp_0 = (register_0 >> 5) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+		tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 496);
+		tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 24;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+		tmp_0 = (register_0 >> 11) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+		tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 512);
+		tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 18;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+		tmp_0 = (register_0 >> 17) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+		tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 528);
+		tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 12;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+		tmp_0 = (register_0 >> 23) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+		tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 544);
+		tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 6;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+		tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+	}
+}
+static void falp_36bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
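+	// NOTE (editor, assumption based on the sibling falp_*bw kernels in this
+	// file): this kernel unpacks 1024 values stored at 36 bits each. Each of
+	// the 16 lanes reads 36 packed 64-bit words (offsets 0, 16, ..., 560) and
+	// emits 64 doubles; decoding per value is
+	//   double(int64_t((packed + base_0) * factor)) * frac10,
+	// with factor and frac10 looked up from the fac/exp metadata.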
+	[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+	[[maybe_unused]] uint64_t register_0;
+	[[maybe_unused]] uint64_t tmp_0;
+	[[maybe_unused]] uint64_t base_0 = *(a_base_p);
+	[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+	[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+	[[maybe_unused]] double tmp_dbl;
+	[[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+	for (int i = 0; i < 16; ++i) {
+		register_0 = *(in + (0 * 16) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 36) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+		tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 16);
+		tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+		tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+		tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 32);
+		tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+		tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+		tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 48);
+		tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+		tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+		tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 64);
+		tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+		tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 80);
+		tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+		tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl;
+		tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+		register_0 = *(in + (0 * 16) + (i * 1) + 96);
+		tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24;
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl;
+		tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1);
+		tmp_0 += base_0;
+		tmp_0 *= factor;
+		tmp_int = tmp_0;
+		tmp_dbl = tmp_int;
+		tmp_dbl *= frac10;
+		*(out + (i * 1) + (0 * 16)
+ (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 
24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_37bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl 
= tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & 
((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & 
((1ULL << 18) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 
>> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_38bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + 
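// Per-value decode: mask out the 38-bit lane, add the frame-of-reference base, + // multiply by the ALP factor (10^fac), convert to int64_t, widen to double, + // and scale by frac10 (10^-exp) to recover the original value. + 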
tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + 
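// When a packed value straddles two 64-bit words (as above), its low bits come + // from the current word and its high bits from the freshly loaded register_0. + 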
tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_39bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 
28) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & 
((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
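// The unsigned intermediate is converted to int64_t to recover the sign, then + // widened to double ahead of the 10^-exp scaling step below. + 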
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 47) 
& ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_40bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = 
(register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = 
tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + 
tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +// Fused ALP decode of one 1024-value vector packed at 41 bits per value: unpack, add the frame-of-reference base, multiply by the factor, then scale by frac10. +static void falp_41bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) =
tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = 
tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +// Fused ALP decode of one 1024-value vector packed at 42 bits per value: unpack, add the frame-of-reference base, multiply by the factor, then scale by frac10. +static void falp_42bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1)
+ (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = 
(register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= 
((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +// Fused ALP decode of one 1024-value vector packed at 43 bits per value: unpack, add the frame-of-reference base, multiply by the factor, then scale by frac10. +static void falp_43bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i
* 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 28; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +// FALP fused kernel: unpacks 1024 packed 44-bit values (16 lanes x 64 values); each value is decoded as (packed + base_0) * FACT_ARR[fac], reinterpreted as int64_t, then scaled by FRAC_ARR[exp] to reconstruct the double. +static void falp_44bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; +
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + 
tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +// FALP fused kernel: same decode as above for 45-bit packed values (1024 values per call). +static void falp_45bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3))
= tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = 
tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; 
+ tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 21; + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +// FALP fused kernel: same decode as above for 46-bit packed values (1024 values per call). +static void falp_46bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); +
register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = 
tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 
53)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +// FALP fused kernel: same decode as above for 47-bit packed values (1024 values per call). +static void falp_47bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 47) - 1); + tmp_0 +=
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + 
(0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i 
* 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 35; + 
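+ // Each 47-bit value may straddle two consecutive 64-bit input words; the shift/mask pair above stitches the low and high parts back together.
+ // The statements that follow are the fused ALP decode: add the frame-of-reference base (base_0), multiply by the factor alp::FACT_ARR[fac], reinterpret as signed int64_t, widen to double, and scale by alp::Constants::FRAC_ARR[exp].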
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_48bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + 
uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); +
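+ // 48-bit case: four 48-bit values occupy exactly three 64-bit words (4 x 48 = 3 x 64), so the unpacking realigns with a fresh, mask-only word load after every fourth output value.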
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) 
+ (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 
32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_49bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto
in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); +
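+ // 49-bit case: 49 and 64 are coprime, so the bit alignment only repeats after a full block of 64 values (64 x 49 bits = 49 words); every output in the block uses its own shift/mask combination.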
register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 
21) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & 
((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_50bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 =
(register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + 
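+ // value 33 spans words 25 and 26 (offsets 400/416): 14 low bits were taken above, 36 more OR-ed in; the lines below finish the fused decode: add base_0 back, multiply by factor (alp::FACT_ARR[fac]) in integer space, reinterpret as int64_t, widen to double, scale by frac10 (alp::Constants::FRAC_ARR[exp])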
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 
1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) 
- 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_51bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0
>> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) 
& ((1ULL << 22) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = 
(register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 
|= ((register_0) & ((1ULL << 17) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + 
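+ // stores are transposed: value v of lane i lands at out[i + 16 * v], so the 16 lanes of each value stay contiguous and the loop over i vectorizes cleanly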
tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_52bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 *
16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & 
((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 38)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL 
<< 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void 
falp_53bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 =
(register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + 
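+ // value 22 also crosses a word boundary: 50 bits were gathered above, the last 3 are OR-ed in from the word at offset 304 to complete the 53-bit code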
tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = 
tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_54bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50; + tmp_0 +=
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i 
* 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = 
tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_55bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in
+ (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 
* 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 33; + tmp_0 += base_0; 
+ tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 
1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 
46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_56bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; +
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + 
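/* Decode pattern for each unrolled step in these generated falp kernels: extract the next bit-packed slice, OR in the spill-over bits when a value straddles two 64-bit input words, add the frame-of-reference base, multiply by the factor alp::FACT_ARR[fac], then scale by alp::Constants::FRAC_ARR[exp] to materialize the decoded double. */ +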
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= 
((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_57bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) +
64); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; 
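+ // the multiply by factor (alp::FACT_ARR[fac], a power of ten) runs in uint64_t and wraps modulo 2^64; assigning the result to the int64_t tmp_int reinterprets the bits, so in two's-complement practice negative ALP integers are recovered before the conversion to double.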
+ tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 24; + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 44; + tmp_0 += base_0; 
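+ // base_0 is the per-vector frame-of-reference minimum removed during compression (the FFOR base); adding it back turns the bit-packed delta into the full ALP integer.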
+ tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_58bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto
in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; +
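+ // the decoded double is stored at out[i + 16 * k]: each 1024-value vector is written as 16 interleaved lanes of 64 values, one lane per iteration of the outer loop.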
*(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 = (register_0) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) 
= tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 
* 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_59bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; +
*(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
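+ // input reads use the same interleaving, in[i + 16 * w]: every group of 16 consecutive words holds one packed word per lane, so this 59-bit kernel walks 59 such groups (offsets 0 through 928) per vector.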
*(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
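+ // the int64_t-to-double conversion is expected to be lossless here: ALP only encodes a value this way when it round-trips through the scaled-integer form, so no precision is lost on decompression.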
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_60bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = 
reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160);
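+ // (Editor's note, a minimal sketch -- not part of the generated source:) every
+ // falp_<K>bw_64ow_64crw_1uf kernel in this file fuses K-bit unpacking with ALP
+ // decoding. Stripped of the unrolling, each stored value follows one pattern;
+ // `bits`, `enc`, `digits`, `w`, and `k` below are illustrative names only:
+ //   uint64_t bits   = /* K packed bits, gathered from one or two words */;
+ //   uint64_t enc    = (bits + base_0) * factor; // undo frame-of-reference, re-apply factor
+ //   int64_t digits  = (int64_t)enc;             // reinterpret as signed decimal digits
+ //   out[i + 16 * k] = (double)digits * frac10;  // frac10 from FRAC_ARR, i.e. a power of ten
+ // which is exactly the tmp_0 / tmp_int / tmp_dbl sequence repeated above and below.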
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; 
+ tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) 
- 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_61bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) &
((1ULL << 49) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = 
(register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 
= (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 59; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + tmp_0 += base_0; + 
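+ // (Editor's note, a reading of the generated code:) a 61-bit value that
+ // straddles two consecutive 64-bit words is stitched together from
+ // complementary pieces: the low `lo` bits come from the tail of the current
+ // word, the remaining `hi` bits (lo + hi = 61) from the head of the next one:
+ //   tmp_0 = (register_0 >> shift) & ((1ULL << lo) - 1);  // tail of word w
+ //   register_0 = *(in + ...);                            // load word w + 1
+ //   tmp_0 |= (register_0 & ((1ULL << hi) - 1)) << lo;    // head of word w + 1
+ // `shift`, `lo`, and `hi` are illustrative names, not identifiers from the source.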
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_62bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 =
*(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + 
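+ // (Editor's note:) the index arithmetic assumes 16 interleaved lanes over a
+ // 1024-value vector: word w of lane i is read from in[(i * 1) + 16 * w] and
+ // decoded value k of lane i is written to out[(i * 1) + 16 * k], so each trip
+ // of the i-loop touches 16 consecutive input words and 16 consecutive output
+ // doubles, which is what lets `#pragma clang loop vectorize(enable)` turn the
+ // loop into straight SIMD code.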
tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + 
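+ // (Editor's note, an interpretation rather than an original comment:) the add
+ // and multiply are deliberately performed on uint64_t, where overflow wraps
+ // with well-defined semantics; only afterwards is the result reinterpreted as
+ // int64_t (tmp_int) to recover possibly negative digits, then widened to
+ // double and scaled by frac10.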
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 
* 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_63bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 63) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 62) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + 
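+        // Step in progress here: tmp_0 holds the 2 low bits of a 63-bit value whose
+        // remaining bits sit in the next packed word. The generated pattern for one
+        // such straddling lane, sketched with illustrative names (B = bit width,
+        // `used` = bits already consumed from the current word):
+        //     tmp  = (reg >> used) & ((1ULL << (64 - used)) - 1);  // low part left in word
+        //     reg  = in[next_word];                                // fetch next packed word
+        //     tmp |= (reg & ((1ULL << (B - (64 - used))) - 1)) << (64 - used);
+        // Every recovered integer then runs the fused ALP reconstruction:
+        //     tmp += base; tmp *= factor;                          // undo FOR, apply 10^fac
+        //     out  = double(int64_t(tmp)) * frac10;                // scale by 10^-exp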
register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 61) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 
1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 
25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & 
((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
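+        // The pair of assignments around this point is the ALP cast chain: the
+        // unsigned tmp_0 (base added, factor applied, both wrapping mod 2^64) is
+        // first reinterpreted as a signed int64_t, so negative digits come out
+        // right under two's complement, and only then widened to double and
+        // scaled by frac10 (= 10^-exp) to reproduce the original value exactly.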
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 59; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 
|= ((register_0) & ((1ULL << 3) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 61; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 992); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 62; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 63) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_64bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + register_0 += base_0; + register_0 *= factor; + 
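+        // Full-width case: at 64 bits each packed word holds exactly one value,
+        // so there is no shift/mask step, just a load followed by the add/multiply
+        // chain on register_0 itself. The tmp_int/tmp_dbl assignments appear to be
+        // carried over from the common kernel template; the stores below write
+        // register_0, not tmp_dbl.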
tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + register_0 += base_0; + register_0 *= factor; + 
tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 512); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 544); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 592); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 608); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 640); + register_0 += base_0; + register_0 *= factor; + 
tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 656); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 688); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 704); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 736); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 752); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 768); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 800); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 816); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 832); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 848); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 864); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 880); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 896); + register_0 += base_0; + register_0 *= factor; + 
tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 912); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 928); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 944); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 960); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 976); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 992); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 1008); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = register_0; + } +} +void falp(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + uint8_t bw, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + switch (bw) { + case 0: + falp_0bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 1: + falp_1bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 2: + falp_2bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 3: + falp_3bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 4: + falp_4bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 5: + falp_5bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 6: + falp_6bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 7: + falp_7bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 8: + falp_8bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 9: + falp_9bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 10: + falp_10bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 11: + falp_11bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 12: + falp_12bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 13: + falp_13bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 14: + falp_14bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 15: + falp_15bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 16: + falp_16bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 17: + falp_17bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 18: + falp_18bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 19: + falp_19bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 20: + 
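+    // `falp` is the runtime entry point: it takes the bit width as an argument
+    // and dispatches to the specialization generated for that width, keeping
+    // every shift amount and mask a compile-time constant inside the kernels.
+    // Typical call site, mirroring the test further down (array names are the
+    // test's):
+    //     generated::falp::fallback::scalar::falp(
+    //         reinterpret_cast<const uint64_t*>(ffor_arr), dec_dbl_arr, bw,
+    //         reinterpret_cast<const uint64_t*>(base_arr), stt.fac, stt.exp);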
falp_20bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 21: + falp_21bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 22: + falp_22bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 23: + falp_23bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 24: + falp_24bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 25: + falp_25bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 26: + falp_26bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 27: + falp_27bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 28: + falp_28bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 29: + falp_29bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 30: + falp_30bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 31: + falp_31bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 32: + falp_32bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 33: + falp_33bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 34: + falp_34bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 35: + falp_35bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 36: + falp_36bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 37: + falp_37bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 38: + falp_38bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 39: + falp_39bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 40: + falp_40bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 41: + falp_41bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 42: + falp_42bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 43: + falp_43bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 44: + falp_44bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 45: + falp_45bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 46: + falp_46bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 47: + falp_47bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 48: + falp_48bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 49: + falp_49bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 50: + falp_50bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 51: + falp_51bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 52: + falp_52bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 53: + falp_53bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 54: + falp_54bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 55: + falp_55bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 56: + falp_56bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 57: + falp_57bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 58: + falp_58bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 59: + falp_59bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 60: + falp_60bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 61: + falp_61bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + 
break;
+    case 62:
+        falp_62bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+        break;
+    case 63:
+        falp_63bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+        break;
+    case 64:
+        falp_64bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+        break;
+    }
+}
+}}} // namespace generated::falp::fallback::scalar
diff --git a/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_test.cpp b/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_test.cpp
new file mode 100644
index 0000000..86ae92b
--- /dev/null
+++ b/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_test.cpp
@@ -0,0 +1,112 @@
+#include "alp.hpp"
+#include "data.hpp"
+#include "gtest/gtest.h"
+#include <fstream>
+
+class fallback_scalar_aav_1024_uf1_falp : public ::testing::Test {
+public:
+    double*   dbl_arr;
+    double*   exc_arr;
+    uint16_t* pos_arr;
+    uint16_t* exc_c_arr;
+    int64_t*  ffor_arr;
+    int64_t*  unffor_arr;
+    int64_t*  base_arr;
+    int64_t*  dig_arr;
+    double*   dec_dbl_arr;
+    uint8_t   bw;
+    uint8_t   factor;
+    uint8_t   exponent;
+    double*   smp_arr;
+    void SetUp() override {
+        dbl_arr     = new double[1024];
+        exc_arr     = new double[1024];
+        pos_arr     = new uint16_t[1024];
+        dig_arr     = new int64_t[1024];
+        dec_dbl_arr = new double[1024];
+        exc_c_arr   = new uint16_t[1024];
+        ffor_arr    = new int64_t[1024];
+        unffor_arr  = new int64_t[1024];
+        base_arr    = new int64_t[1024];
+        smp_arr     = new double[1024];
+    }
+    ~fallback_scalar_aav_1024_uf1_falp() override {
+        delete[] dbl_arr;
+        delete[] exc_arr;
+        delete[] pos_arr;
+        delete[] dig_arr;
+        delete[] dec_dbl_arr;
+        delete[] exc_c_arr;
+        delete[] ffor_arr;
+        delete[] unffor_arr;
+        delete[] base_arr;
+        delete[] smp_arr;
+    }
+};
+TEST_F(fallback_scalar_aav_1024_uf1_falp, fused) {
+    for (auto& dataset : alp_bench::alp_dataset) {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+        ASSERT_EQ(ifile.fail(), false);
+        alp::state stt;
+        if (dataset.suitable_for_cutting) { continue; }
+        if (dataset.name.find("bw") != std::string::npos) { continue; }
+        double num = 0.0;
+        size_t c {0};
+        while (ifile >> num) {
+            dbl_arr[c] = num;
+            c = c + 1;
+        }
+        // Init
+        alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+        // Encode
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        ffor::ffor(dig_arr, ffor_arr, bw, base_arr);
+        // Decode
+        generated::falp::fallback::scalar::falp(reinterpret_cast<const uint64_t*>(ffor_arr),
+                                                dec_dbl_arr,
+                                                bw,
+                                                reinterpret_cast<const uint64_t*>(base_arr),
+                                                stt.fac,
+                                                stt.exp);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+        for (size_t i = 0; i < 1024; ++i) {
+            ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+        }
+        ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+        ASSERT_EQ(dataset.bit_width, bw);
+        ifile.close();
+    }
+}
+
+TEST_F(fallback_scalar_aav_1024_uf1_falp, unfused) {
+    for (auto& dataset : alp_bench::alp_dataset) {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+        ASSERT_EQ(ifile.fail(), false);
+        alp::state stt;
+        if (dataset.suitable_for_cutting) { continue; }
+        if (dataset.name.find("bw") != std::string::npos) { continue; }
+        double num = 0.0;
+        size_t c {0};
+        while (ifile >> num) {
+            dbl_arr[c] = num;
+            c = c + 1;
+        }
+        // Init
+        alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+        // Encode
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        ffor::ffor(dig_arr, ffor_arr, bw, base_arr);
+        // Decode
+        unffor::unffor(ffor_arr, unffor_arr, bw, base_arr);
+        alp::AlpDecode::decode(unffor_arr, stt.fac, stt.exp, dec_dbl_arr);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+        for (size_t i = 0; i < 1024; ++i) {
+            ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+        }
+        ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+        ASSERT_EQ(dataset.bit_width, bw);
+        ifile.close();
+    }
+}
diff --git a/generated/fallback/scalar_aav_uf1/falp.cmake b/generated/fallback/scalar_aav_uf1/falp.cmake
new file mode 100644
index 0000000..6ac60e8
--- /dev/null
+++ b/generated/fallback/scalar_aav_uf1/falp.cmake
@@ -0,0 +1,16 @@
+#------------------------------------------------------------------------------------------------------
+if (ALP_BUILD_TESTING)
+    add_executable(fallback_scalar_aav_1024_uf1_falp_test fallback_scalar_aav_1024_uf1_falp_test.cpp)
+    target_link_libraries(fallback_scalar_aav_1024_uf1_falp_test PRIVATE ALP)
+    target_link_libraries(fallback_scalar_aav_1024_uf1_falp_test PRIVATE gtest_main)
+    target_include_directories(fallback_scalar_aav_1024_uf1_falp_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+    gtest_discover_tests(fallback_scalar_aav_1024_uf1_falp_test)
+endif ()
+#------------------------------------------------------------------------------------------------------
+if (ALP_BUILD_BENCHMARK)
+    configure_file(${CMAKE_SOURCE_DIR}/alp_bench/alp_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/fallback_scalar_aav_1024_uf1_falp_bench.hpp)
+    add_executable(fallback_scalar_aav_1024_uf1_falp_bench fallback_scalar_aav_1024_uf1_falp_bench.cpp)
+    target_link_libraries(fallback_scalar_aav_1024_uf1_falp_bench PRIVATE ALP)
+    target_include_directories(fallback_scalar_aav_1024_uf1_falp_bench PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+    add_alp_benchmark(fallback_scalar_aav_1024_uf1_falp_bench)
+endif ()
diff --git a/generated/fallback/scalar_nav_uf1/CMakeLists.txt b/generated/fallback/scalar_nav_uf1/CMakeLists.txt
new file mode 100644
index 0000000..2b9ff8a
--- /dev/null
+++ b/generated/fallback/scalar_nav_uf1/CMakeLists.txt
@@ -0,0 +1,50 @@
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unpack.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/ut.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/normal.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/pack.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unffor.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/ffor.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/unrsum.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/rsum.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/untranspose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/transpose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/rsum_and_untranspose.cmake)
+else()
+endif()
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake")
+    include(${CMAKE_CURRENT_SOURCE_DIR}/falp.cmake)
+else()
+endif()
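+# Each helper .cmake above is included only when the generator actually emitted
+# it; the empty else() branches are no-ops. A tighter equivalent (a sketch, with
+# the module list taken from the includes above) would be:
+#   foreach(module unpack ut normal pack unffor ffor unrsum rsum untranspose
+#           transpose rsum_and_untranspose falp)
+#     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${module}.cmake")
+#       include("${CMAKE_CURRENT_SOURCE_DIR}/${module}.cmake")
+#     endif()
+#   endforeach()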
+set(FLS_GENERATED_OBJECT_FILES
+    ${FLS_GENERATED_OBJECT_FILES} PARENT_SCOPE)
diff --git a/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp b/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp
new file mode 100644
index 0000000..6f17cc2
--- /dev/null
+++ b/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp
@@ -0,0 +1,128 @@
+#include "fallback_scalar_nav_1024_uf1_falp_bench.hpp"
+#include "alp/alp.hpp"
+#include "datasets.hpp"
+#include "alp/ffor.hpp"
+#include "alp/unffor.hpp"
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run
+bench_alp_fused_decode(alp_bench::Dataset& dataset,
+                       int64_t*            ffor_arr,
+                       uint8_t             bw,
+                       int64_t*            base_arr,
+                       uint8_t             factor,
+                       uint8_t             exponent,
+                       double*             dec_dbl_arr,
+                       double*             exc_arr,
+                       uint16_t*           pos_arr,
+                       uint16_t*           exc_c_arr) {
+    int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+    uint64_t iterations = 3000000;
+#else
+    uint64_t iterations = 1;
+#endif
+
+    std::string benchmark_name = dataset.name + "_fused";
+
+    uint64_t cycles = benchmark::cycleclock::Now();
+    for (uint64_t i = 0; i < iterations; ++i) {
+        generated::falp::fallback::scalar::falp(reinterpret_cast<const uint64_t*>(ffor_arr),
+                                                dec_dbl_arr,
+                                                bw,
+                                                reinterpret_cast<const uint64_t*>(base_arr),
+                                                factor,
+                                                exponent);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+    }
+
+    cycles = benchmark::cycleclock::Now() - cycles;
+
+    return benchmark::BenchmarkReporter::Run(
+        benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
+}
+static __attribute__((noinline)) benchmark::BenchmarkReporter::Run
+bench_alp_decode(alp_bench::Dataset& dataset,
+                 int64_t*            ffor_arr,
+                 int64_t*            unffor_arr,
+                 uint8_t             bw,
+                 int64_t*            base_arr,
+                 uint8_t             factor,
+                 uint8_t             exponent,
+                 double*             dec_dbl_arr,
+                 double*             exc_arr,
+                 uint16_t*           pos_arr,
+                 uint16_t*           exc_c_arr) {
+    int benchmark_number = dataset.id;
+
+#ifdef NDEBUG
+    uint64_t iterations = 3000000;
+#else
+    uint64_t iterations = 1;
+#endif
+
+    std::string benchmark_name = dataset.name;
+
+    uint64_t cycles = benchmark::cycleclock::Now();
+    for (uint64_t i = 0; i < iterations; ++i) {
+        alp::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
+        alp::AlpDecode::decode(unffor_arr, factor, exponent, dec_dbl_arr);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+    }
+
+    cycles = benchmark::cycleclock::Now() - cycles;
+
+    return benchmark::BenchmarkReporter::Run(
+        benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
+}
+void benchmark_all(benchmark::Benchmark& benchmark) {
+    double*   dbl_arr;
+    double*   exc_arr;
+    uint16_t* pos_arr;
+    uint16_t* exc_c_arr;
+    int64_t*  ffor_arr;
+    int64_t*  unffor_arr;
+
+    int64_t* base_arr;
+    int64_t* dig_arr;
+    double*  dec_dbl_arr;
+
+    uint8_t bw;
+    uint8_t factor;
+    uint8_t exponent;
+    alp::state stt; // encoder state passed by reference to encode() below
+
+    dbl_arr     = new (std::align_val_t {64}) double[1024];
+    exc_arr     = new (std::align_val_t {64}) double[1024];
+    pos_arr     = new (std::align_val_t {64}) uint16_t[1024];
+    dig_arr     = new (std::align_val_t {64}) int64_t[1024];
+    dec_dbl_arr = new (std::align_val_t {64}) double[1024];
+    exc_c_arr   = new (std::align_val_t {64}) uint16_t[1024];
+    ffor_arr    = new (std::align_val_t {64}) int64_t[1024];
+    unffor_arr  = new (std::align_val_t {64}) int64_t[1024];
+    base_arr    = new (std::align_val_t {64}) int64_t[1024];
+
+    for (auto& dataset : alp_bench::datasets) {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+
+        // check to see that the file was
opened correctly:
+        if (!ifile.is_open()) {
+            exit(1); // exit or do additional error checking
+        }
+
+        double num = 0.0;
+        // keep storing values from the text file so long as data exists:
+        size_t c {0};
+        while (ifile >> num) {
+            dbl_arr[c] = num;
+            c += 1;
+        }
+
+        factor   = dataset.factor;
+        exponent = dataset.exponent;
+
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+
+        benchmark.Run(bench_alp_fused_decode(
+            dataset, ffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));
+
+        benchmark.Run(bench_alp_decode(
+            dataset, ffor_arr, unffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));
+
+        ifile.close();
+    }
+}
+int main() {
+    benchmark::Benchmark benchmark =
+        benchmark::create("fallback_scalar_nav_1024_uf1_falp")
+            .save()
+            .at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile())
+            .print()
+            .add_extra_info(benchmark::CmakeInfo::getCmakeInfo());
+    benchmark_all(benchmark);
+}
diff --git a/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_src.cpp b/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_src.cpp
new file mode 100644
index 0000000..5297732
--- /dev/null
+++ b/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_src.cpp
@@ -0,0 +1,33764 @@
+#include "alp/alp.hpp"
+#include "alp/macros.hpp"
+namespace generated
+{
+    namespace falp::fallback
+    {
+        namespace scalar
+        {
+            static void falp_0bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+            {
+                [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p);
+                [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p);
+                [[maybe_unused]] uint64_t register_0;
+                [[maybe_unused]] uint64_t tmp_0;
+                [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+                [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+                [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+                [[maybe_unused]] double tmp_dbl;
+                [[maybe_unused]] int64_t tmp_int;
+                for (int i = 0; i < 16; ++i)
+                {
+                    *(out + (i * 1) + (0 * 16) + (16 * 0)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 1)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 2)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 3)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 4)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 5)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 6)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 7)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 8)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 9)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 10)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 11)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 12)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 13)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 14)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 15)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 16)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 17)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 18)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 19)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 20)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 21)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 22)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 23)) = base_0;
+                    *(out + (i * 1) + (0 * 16) + (16 * 24)) = base_0;
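+                    // Degenerate 0-bit case: every value in this vector equals the
+                    // frame-of-reference base, so the kernel simply broadcasts base_0
+                    // to all 1024 outputs; no unpacking or factor/frac10 arithmetic
+                    // is emitted for it.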
*(out + (i * 1) + (0 * 16) + (16 * 25)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = base_0; + } + } + static void falp_1bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 
1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 
* 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
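// Convert the scaled integer to double via int64_t, then multiply by
// frac10 (alp::Constants::FRAC_ARR[exp], i.e. 10^-exp) to finish the fused ALP decode.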
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 
= (register_0 >> 57) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_2bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_3bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 30)
& ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 3) - 1); + 
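// base_0 (*a_base_p) is the frame-of-reference base for this vector: adding it
// back turns the bit-packed delta into the full ALP-encoded integer.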
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 3) - 1); + tmp_0 += base_0; + 
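// factor (alp::FACT_ARR[fac], i.e. 10^fac) rescales the encoded digits before
// the conversion to double below.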
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = 
tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_4bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL <<
4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = 
tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_5bw_64ow_64crw_1uf(const uint64_t 
*__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 =
(register_0 >> 6) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & 
((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 5) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_6bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = 
reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = 
(register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) 
+ (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = 
(register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_7bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + 
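+    // Editorial note (not part of the generated kernel): every unrolled block
+    // in these kernels is the same fused bit-unpack + ALP-decode step. A
+    // minimal scalar sketch of one step, with W the bit width and `shift` the
+    // offset inside the current 64-bit word (these names are illustrative):
+    //
+    //   uint64_t v = (word >> shift) & ((1ULL << W) - 1); // extract W bits
+    //   v += base;                  // undo the frame-of-reference base
+    //   v *= factor;                // alp::FACT_ARR[fac], a power of ten
+    //   int64_t i64 = (int64_t)v;   // reinterpret as signed decimal digits
+    //   double  d   = (double)i64;  // convert to double
+    //   d *= frac10;                // alp::Constants::FRAC_ARR[exp], 10^-exp
+    //
+    // A value that straddles two 64-bit words is stitched together with an
+    // extra shift/OR before decoding, as in the `tmp_0 |= ...` lines below.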
[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = 
(register_0 >> 23) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_8bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + 
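+    // Editorial note (not part of the generated kernel): with an 8-bit width
+    // the packed values are byte-aligned, so no value straddles a 64-bit word
+    // boundary. Each input word yields exactly eight values, and the
+    // boundary-stitching `tmp_0 |= ...` pattern of the other widths does not
+    // appear in this kernel.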
[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 
* 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 
>> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_9bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 9) - 1); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = 
tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_10bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; +
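+ // Decode recipe (reference form, inferred from the unrolled statements
+ // below; annotation only, no new behavior): each packed 10-bit code c
+ // decodes as
+ //   value = double(int64_t((c + base_0) * factor)) * frac10;
+ // i.e. frame-of-reference add, ALP factor multiply (wrapping mod 2^64),
+ // reinterpretation as signed, then the fractional scaling. The i-loop
+ // walks 16 interleaved lanes; each lane packs its 64 values into ten
+ // 64-bit input words (offsets 0, 16, ..., 144).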
[[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) 
+ (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 10) 
- 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
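+ // Stores write to out[16 * value_index + i], keeping the 16 lanes
+ // interleaved across the 1024-value output block.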
*(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_11bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const
auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; +
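+ // The uint64 -> int64 copy below reinterprets the wrapped product as a
+ // signed (two's-complement) integer before it is converted to double.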
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; 
+ tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = 
(register_0 >> 53) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_12bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1);
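+ // The shift-and-mask above extracts the next 12-bit code from the
+ // current input word; codes that straddle a 64-bit word boundary are
+ // stitched together from two loads via the masked-OR pattern seen above.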
+ tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 
44)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_13bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *=
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_14bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; +
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 
14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; 
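+ // Decode pattern shared by every falp_*bw_64ow_64crw_1uf kernel above and below:
+ // 1024 doubles are stored as 16 interleaved lanes of 64 values, so value j of
+ // lane i is written to out[i + 16 * j]. Per value, the fused pipeline is:
+ // mask/shift the packed bits out of register_0, add the frame-of-reference
+ // base_0, multiply by the ALP factor (alp::FACT_ARR[fac]), pass the result
+ // through int64_t so the digits are reinterpreted as signed, then scale by
+ // frac10 (alp::Constants::FRAC_ARR[exp]) to recover the decoded double.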
+ tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = 
(register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_15bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) -
1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 15) - 1); + 
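+ // At 15 bits per value, each lane consumes 15 packed 64-bit words: word k of
+ // lane i sits at in[i + 16 * k], which is why this kernel's reloads step
+ // through offsets +0, +16, ..., +224 (15 loads per lane, 240 words per
+ // 1024-value vector).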
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i 
* 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 3; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_16bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor;
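+ // 16 divides 64, so this kernel never splits a value across words: a fresh
+ // word is loaded after every four values. For the general case, a minimal
+ // non-unrolled scalar equivalent of these generated kernels is sketched below
+ // (illustrative only; BW is a stand-in for the kernel's compile-time bit width):
+ //
+ //   for (int lane = 0; lane < 16; ++lane) {
+ //       uint64_t word = in[lane];
+ //       int used = 0, k = 0; // bits consumed in `word`, word index within the lane
+ //       for (int j = 0; j < 64; ++j) {
+ //           uint64_t v = (word >> used) & ((1ULL << BW) - 1);
+ //           used += BW;
+ //           if (used > 64) { // value straddles into the next word
+ //               word = in[lane + 16 * ++k];
+ //               v |= (word & ((1ULL << (used - 64)) - 1)) << (BW - (used - 64));
+ //               used -= 64;
+ //           } else if (used == 64 && j != 63) {
+ //               word = in[lane + 16 * ++k];
+ //               used = 0;
+ //           }
+ //           int64_t digits = (int64_t)((v + base_0) * (uint64_t)factor);
+ //           out[lane + 16 * j] = (double)digits * frac10;
+ //       }
+ //   }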
+ tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & 
((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int 
= tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_17bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) +
(i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
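+ // 17 does not divide 64, so values routinely straddle word boundaries. The
+ // stitch just above is the pattern: take the low bits left in the current
+ // word ((register_0 >> 63) & ((1ULL << 1) - 1)), reload register_0 with this
+ // lane's next word, and OR the remaining high bits in above them before the
+ // usual base/factor/frac10 reconstruction.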
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = 
(register_0 >> 15) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
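+ // These kernels are generated once per bit width, so a caller picks one from
+ // the packed width of the vector. A minimal dispatch sketch (assumed shape
+ // with a hypothetical falp_unpack name, not the generated dispatcher itself):
+ //
+ //   void falp_unpack(uint8_t bw, const uint64_t* in, double* out,
+ //                    const uint64_t* base, uint8_t fac, uint8_t exp) {
+ //       switch (bw) {
+ //           case 16: falp_16bw_64ow_64crw_1uf(in, out, base, fac, exp); break;
+ //           case 17: falp_17bw_64ow_64crw_1uf(in, out, base, fac, exp); break;
+ //           // ... one case per generated width ...
+ //       }
+ //   }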
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = 
(register_0 >> 30) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_18bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 
= (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = 
(register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_19bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 
14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 19) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i 
* 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_20bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = 
tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 
* 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_21bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl 
= tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl 
= tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_22bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) -
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 22) - 1); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= 
((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= 
((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_23bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, 
const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *=
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) 
<< 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 23) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 23) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_24bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + 
tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 
>> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 
24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_25bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t
tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & 
((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL 
<< 3) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_26bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i
* 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 
40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; 
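+ // NOTE: a reading of the generated pattern above, with illustrative names w0/w1/s/k that are
+ // not part of this patch: each value is extracted as tmp = (w0 >> s) & ((1ULL << k) - 1), and
+ // when a 26-bit value straddles two 64-bit words the high part is stitched in with
+ // tmp |= (w1 & ((1ULL << (26 - k)) - 1)) << k. The fused ALP decode then computes
+ // double(int64_t((tmp + base_0) * alp::FACT_ARR[fac])) * alp::Constants::FRAC_ARR[exp];
+ // only the shift/mask constants differ between the falp_<bw>bw kernels.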
+ tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_27bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t
register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 27) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 24; + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_28bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); 
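+ // NOTE: because 16 values * 28 bits = 448 bits is exactly seven 64-bit words, this kernel
+ // realigns every sixteen outputs: the fresh register_0 loads at in-offsets 0, 112, 224 and
+ // 336 start on a word boundary with no carry from the previous word, while widths such as
+ // 27 realign less often. All falp_<bw>bw_64ow_64crw_1uf kernels share one signature, so a
+ // caller could plausibly dispatch through a function-pointer table indexed by bit width,
+ // e.g. using falp_fn = void (*)(const uint64_t *, double *, const uint64_t *, uint8_t,
+ // uint8_t); such a table is an assumption here, not something this patch defines.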
+ tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 
= (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 
1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_29bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = 
tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = 
tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_30bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
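+ // [Editor's annotation, not part of the generated kernels] Every unpacked value in
+ // these functions goes through the same fused decode chain: given the BW-bit field v,
+ // the output is double(int64_t((v + base_0) * factor)) * frac10 — FOR-decoding (add
+ // the frame base), ALP de-quantization (multiply by FACT_ARR[fac]), then scaling by
+ // FRAC_ARR[exp]; unpacking and decoding fuse in one pass with no intermediate buffer.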
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl 
= tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + 
(0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_31bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 
|= ((register_0) & ((1ULL << 25) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 
|= ((register_0) & ((1ULL << 11) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_32bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
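+ // [Editor's annotation] At a bit width of exactly 32, values never straddle a 64-bit
+ // word: each input word yields two values (mask the low half, shift down for the high
+ // half), so this kernel needs none of the cross-word mask-and-OR stitching that the
+ // odd-width kernels above perform.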
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + 
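+ // [Editor's annotation] On the addressing scheme: loads are *(in + (i * 1) + k) with k
+ // a multiple of 16, stores are *(out + (i * 1) + (16 * j)) with j = 0..63 — i.e. 16
+ // interleaved lanes per 1024-value block, where lane i consumes the (k/16)-th word of
+ // its own packed column and writes output positions i, i + 16, i + 32, ...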
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = 
(register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_33bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + 
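+ // [Editor's annotation] From 33 bits upward a value can span two consecutive input
+ // words, so each straddling extraction keeps the high remainder of the current word
+ // and ORs in the low bits of the freshly loaded next word — the
+ // tmp_0 |= ((register_0) & mask) << carry pattern in the surrounding statements.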
register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + 
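+ // This value straddles a 64-bit word boundary: its 11 low bits come from the
+ // current word and the remaining 22 bits are OR-ed in from the next load.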
register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
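+ // Output layout: *(out + i + 16 * j) interleaves the 16 lanes, so value j of
+ // every lane lands in one contiguous 16-element stripe of the output block.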
*(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_34bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) 
+ (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL 
<< 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL 
<< 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_35bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; 
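+ // For readability: each unrolled falp_<bw>bw_64ow_64crw_1uf kernel in this
+ // family is equivalent to the rolled sketch below (illustrative only; BW and
+ // extract_bits are hypothetical names, not part of the generated code):
+ //
+ //   for (int i = 0; i < 16; ++i) {                      // 16 interleaved lanes
+ //       for (int j = 0; j < 64; ++j) {                  // 64 values per lane
+ //           uint64_t bits = extract_bits(in, i, j, BW); // BW packed bits
+ //           uint64_t tmp  = bits + base_0;              // undo FOR base
+ //           int64_t  dig  = (int64_t)(tmp * factor);    // rescale digits
+ //           out[i + 16 * j] = (double)dig * frac10;     // back to double
+ //       }
+ //   }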
+ [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 35) - 1); + tmp_0 += base_0; + 
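+ // (35 and 64 are coprime, so within a 64-value block no value realigns to a
+ // word boundary; a little over half of them are stitched from two words.)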
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 30; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_36bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 1)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = 
tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int 
= tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + 
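+ // Every value in these generated kernels is decoded by the same fused sequence:
+ // mask out the `bw` payload bits, add the frame-of-reference base (base_0),
+ // multiply by the ALP factor, round-trip through int64_t, and scale by frac10.
+ // As an illustrative sketch only (`unpack` is a hypothetical helper, not part of
+ // this file), each step is equivalent to:
+ //   uint64_t digits = unpack(in, idx, bw) + base_0;
+ //   out[idx] = static_cast<double>(static_cast<int64_t>(digits * factor)) * frac10;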
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_37bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16)
+ (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_38bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); +
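+ // A 38-bit value can straddle two 64-bit words: the low 28 bits came from the
+ // previous word, and the freshly loaded word supplies the remaining 10 high bits.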
tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + 
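+ // Indexing note: with i in [0, 16), output position (i * 1) + (16 * k) means lane
+ // i produces elements i, i + 16, i + 32, ... of the 1024-value block, matching
+ // the 16-way interleaved layout of the packed input words.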
tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + 
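+ // Only 8 bits of the next value remain in the current word (bits 56..63); the
+ // other 30 bits are OR-ed in from the following word before decoding.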
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); 
+ register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 
24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_39bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 *
16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 
16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 18; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 39) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_40bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; +
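+ // With bw = 40, word alignment repeats every 8 values (8 * 40 = 5 * 64 bits), so
+ // the mask/shift pattern above recurs with period 8 for the rest of the loop body.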
*(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= 
((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 
>> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + }
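+ // Every falp_<bw>bw_64ow_64crw_1uf kernel in this file is a generator-unrolled instance of one
+ // decode recipe: unpack a bw-bit integer from 16 interleaved lanes, undo the frame-of-reference
+ // base, undo the decimal factor via FACT_ARR (a power-of-ten table), and rescale via FRAC_ARR
+ // (the matching negative-power-of-ten table). A minimal rolled-up sketch of that recipe follows;
+ // falp_decode_generic, bw, lane, value and avail are illustrative names of ours and are not part
+ // of the generator's output.
+ static void falp_decode_generic(const uint64_t *__restrict in, double *__restrict out, uint64_t base, uint8_t bw, uint8_t fac, uint8_t exp)
+ {
+     const int64_t factor = alp::FACT_ARR[fac];
+     const double frac10 = alp::Constants::FRAC_ARR[exp];
+     const uint64_t mask = bw == 64 ? ~0ULL : ((1ULL << bw) - 1);
+     for (int lane = 0; lane < 16; ++lane) // 16 interleaved lanes, as in the unrolled loops
+     {
+         uint64_t bit_pos = 0;
+         for (int value = 0; value < 64; ++value) // 64 values per lane
+         {
+             uint64_t tmp = (in[(bit_pos / 64) * 16 + lane] >> (bit_pos % 64)) & mask;
+             const uint64_t avail = 64 - (bit_pos % 64); // bits left in the current word
+             if (avail < bw) // the value straddles two words: splice in the spill bits
+             {
+                 tmp |= (in[(bit_pos / 64 + 1) * 16 + lane] & ((1ULL << (bw - avail)) - 1)) << avail;
+             }
+             tmp += base;   // undo frame-of-reference encoding
+             tmp *= factor; // undo the decimal factor (same wrapping uint64 math as the kernels)
+             out[lane + 16 * value] = static_cast<double>(static_cast<int64_t>(tmp)) * frac10;
+             bit_pos += bw;
+         }
+     }
+ }
+ // e.g. falp_decode_generic(a_in_p, a_out_p, *a_base_p, 41, fac, exp) should match
+ // falp_41bw_64ow_64crw_1uf on the same inputs.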
+ static void falp_41bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |=
((register_0) & ((1ULL << 3) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; 
+ tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + 
tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = 
tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; 
+ tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_42bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0)
& ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_43bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) +
112); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 5; + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 22; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+ }
+ static void falp_44bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+ for (int i = 0; i < 16; ++i)
+ {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int =
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = 
(register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 
= *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + 
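+ // Value 62 straddles a word boundary: its low 24 bits come from the top of
+ // the current input word (shifted down by the 40 bits already consumed),
+ // and the load below fetches the next word so the remaining 20 bits can be
+ // OR-ed in at bit position 24 to complete the 44-bit code.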
register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+ }
+ static void falp_45bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+ for (int i = 0; i < 16; ++i)
+ {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 8)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 
1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+ }
+ static void falp_46bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp)
+ {
+ [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+ for (int i = 0; i < 16; ++i)
+ {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
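+ // Addressing note: the packed data uses an interleaved, FastLanes-style
+ // layout with 16 lanes. Word w of lane i sits at in[w * 16 + i] and value
+ // v of lane i is written to out[v * 16 + i]; 16 lanes times 64 values per
+ // lane yield one ALP vector of 1024 doubles per call.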
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 
1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & 
((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 
26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + /* Fused ALP decompression (FALP) kernel: unpacks 1024 47-bit FOR-packed values (16 interleaved lanes of 64 values) and applies the ALP decoding arithmetic in the same pass, so no intermediate unpacked integer buffer is materialized. */ + static void falp_47bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
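/* ALP decode, applied to every unpacked value: rebase the FOR-encoded integer (+ base_0), multiply by factor = alp::FACT_ARR[fac], then widen to double and scale by frac10 = alp::Constants::FRAC_ARR[exp] (these appear to be the 10^fac and 10^-exp decoding constants of ALP) before the store that follows. */ + 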
*(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) 
<< 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL 
<< 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & 
((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_48bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
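/* Output layout: the store index (i * 1) + (16 * k) interleaves the 16 lanes, so value k of lane i lands at offset i + 16 * k and consecutive values of one lane sit 16 doubles apart. */ + 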
*(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + 
(i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 
32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 
= *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 
32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_49bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = 
(register_0 >> 4) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
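/* The uint64_t product is copied through the int64_t tmp_int so that values which are negative in two's complement convert to the intended signed integer before being widened to double. */ + 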
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = 
tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_50bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + 
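/* This 50-bit value straddles a word boundary: its low 14 bits were extracted from the previous 64-bit word, and the freshly loaded word supplies the remaining 36 high bits via the masked OR below. */ + 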
tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 
= (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + 
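+ // Value 26 straddles a word boundary: its 44 low bits were taken from word 20
+ // above ((register_0 >> 20) & ((1ULL << 44) - 1)), and the remaining 6 high
+ // bits are OR-ed in from the freshly loaded word 21 below. Every cross-word
+ // value in this 50-bit kernel follows the same shift/mask/merge pattern.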
tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + 
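+ // Decoding per value: the unpacked bits are a frame-of-reference delta, so the
+ // kernel adds base_0, multiplies by the decoding factor (alp::FACT_ARR[fac], a
+ // power of ten in ALP), views the product as a signed int64, widens it to
+ // double, and scales by frac10 (alp::Constants::FRAC_ARR[exp]) to reconstruct
+ // the original double.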
register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_51bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict 
a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1)
+ (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + 
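+ // Addressing note: the block is stored column-interleaved across 16 lanes.
+ // Packed word w of lane i sits at in[w * 16 + i] (here w = 19, offset 304),
+ // and decoded value v of lane i is written to out[v * 16 + i], so the outer
+ // loop over i decodes 16 lanes of 64 values each (1024 doubles per block).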
register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
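+ // The tmp_int step above is what makes the conversion signed: the uint64_t
+ // product is viewed as int64_t before being widened to double, so negative
+ // encodings survive the integer-to-floating-point conversion.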
*(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) 
- 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl 
*= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_52bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 =
*(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + 
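+ // With a 52-bit width the packing realigns every 16 values (16 * 52 = 832 =
+ // 13 * 64), so values 0, 16, 32 (the fresh load from word 26 above), and 48
+ // start exactly on a word boundary and need no carry-in from a previous word.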
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= 
((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_53bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; +
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & 
((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & 
((1ULL << 1) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = 
(register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 
|= ((register_0) & ((1ULL << 46) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_54bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double
frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = 
(register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 40; + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
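+ // Note (inferred from the indexing above): stores are lane-interleaved — within the 16-lane unrolled loop, value v of lane i is written to out[(16 * v) + i], so one call fills a contiguous block of 16 * 64 = 1024 doubles.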
*(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) 
- 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_55bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 55) - 1); + tmp_0 += base_0; +
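+ // Note: each unpacked value goes through the same fused ALP decode: add the frame-of-reference base (base_0), multiply by factor (alp::FACT_ARR[fac], presumably 10^fac), round-trip through int64_t, then scale by frac10 (alp::Constants::FRAC_ARR[exp], presumably 10^-exp) to reconstruct the double.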
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 43; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); 
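+ // Note: this 55-bit value straddles a 64-bit word boundary — the low 12 bits were just taken from the current word; the next word is loaded and its low 43 bits are OR'd in above them (12 + 43 = 55, the bit width of this kernel).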
+ register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 
55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_56bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 +=
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & 
((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) 
+ (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i 
* 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_57bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); +
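+ // Editor's annotation (not generator output): out and in (declared next) view the destination and the packed source as plain double* / const uint64_t* words; "57bw" denotes the 57-bit packed width, and one call of this kernel decodes 16 lanes x 64 values = 1024 doubles.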
[[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1)
+ (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; 
+ *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) 
- 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL 
<< 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_58bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); +
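+ // Editor's annotation (not generator output): a 58-bit value straddles two packed words here; the statement above keeps its 30 low bits from the current word, and the load below fetches the next word, whose 28 low bits are OR-ed in as the value's high bits (30 + 28 = 58) before the fused ALP decode runs.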
register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); 
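+ // Editor's annotation (not generator output): the pointer arithmetic interleaves 16 lanes, so lane i reads its packed words at in[i], in[i + 16], in[i + 32], ... (as in the load below) and writes its v-th decoded value to out[i + 16 * v].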
+ register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 
46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 = (register_0) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_59bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; +
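+ // Editor's annotation (not generator output): factor (above) and frac10 (declared next) are the ALP decode constants looked up from FACT_ARR/FRAC_ARR; each unpacked integer is rebased with base_0, multiplied by factor, converted to a signed int64 and then to double, and finally scaled by frac10.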
[[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); 
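+ // Editor's annotation (not generator output): tmp_0 and the factor multiply are carried out in uint64_t (wrapping modulo 2^64); the later assignment to tmp_int reinterprets those bits as a signed value, so negative decoded integers convert to double correctly.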
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) 
+ 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + 
(i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 
16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in 
+ (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_60bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
80); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 
4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & 
((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + 
(i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 
40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_61bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + 
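+ // Every unrolled step in these generated kernels applies one scalar recipe per value.
+ // A minimal sketch of that recipe (illustrative only, assuming alp::FACT_ARR holds
+ // powers of ten and alp::Constants::FRAC_ARR their reciprocals; unpack_one() is a
+ // hypothetical helper standing in for the shift/mask chains below):
+ //   uint64_t digits = unpack_one(in, v, i, /*bit_width=*/61); // value v of lane i
+ //   digits += base_0;                  // undo the frame-of-reference offset
+ //   digits *= factor;                  // rescale by 10^fac; modular wrap is intended
+ //   *(out + 16 * v + i) = static_cast<double>(static_cast<int64_t>(digits)) * frac10; // * 10^-exp
+ // Indexing follows the interleaved layout: word w of lane i sits at in[16 * w + i] and
+ // value v of lane i lands at out[16 * v + i]; 16 lanes x 64 values = 1024 doubles per call.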
[[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 
1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 
5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
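+ // (The uint64 -> int64 -> double hops reinterpret the unsigned product as a signed
+ // quantity before the final scale by frac10; relying on two's-complement wrap-around
+ // in the integer multiply is deliberate and mirrors the encoder.)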
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 59; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; 
+ tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= 
((register_0) & ((1ULL << 15) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_62bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 54) - 
1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & 
((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl 
= tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) 
+ (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_63bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 63) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 62) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 61) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
144); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 32; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 
1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 59; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 61; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 992); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 62; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 63) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } + } + static void falp_64bw_64ow_64crw_1uf(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; + for (int i = 0; i < 16; ++i) + { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + register_0 += 
base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + register_0 += base_0; + 
register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 512); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 544); + register_0 += base_0; + 
register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 592); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 608); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 640); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 656); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 688); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 704); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 736); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 752); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 768); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 800); + register_0 += base_0; + 
register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 816); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 832); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 848); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 864); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 880); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 896); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 912); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 928); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 944); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 960); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 976); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 992); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 1008); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = register_0; + } + } + void falp(const uint64_t *__restrict a_in_p, double *__restrict a_out_p, uint8_t bw, const uint64_t *__restrict a_base_p, uint8_t fac, uint8_t exp) + { + switch (bw) + { + case 0: + falp_0bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 1: + falp_1bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 2: + falp_2bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 3: + falp_3bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 
4: + falp_4bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 5: + falp_5bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 6: + falp_6bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 7: + falp_7bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 8: + falp_8bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 9: + falp_9bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 10: + falp_10bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 11: + falp_11bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 12: + falp_12bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 13: + falp_13bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 14: + falp_14bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 15: + falp_15bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 16: + falp_16bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 17: + falp_17bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 18: + falp_18bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 19: + falp_19bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 20: + falp_20bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 21: + falp_21bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 22: + falp_22bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 23: + falp_23bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 24: + falp_24bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 25: + falp_25bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 26: + falp_26bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 27: + falp_27bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 28: + falp_28bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 29: + falp_29bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 30: + falp_30bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 31: + falp_31bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 32: + falp_32bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 33: + falp_33bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 34: + falp_34bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 35: + falp_35bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 36: + falp_36bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 37: + falp_37bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 38: + falp_38bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 39: + falp_39bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 40: + falp_40bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 41: + falp_41bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 42: + falp_42bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 43: + falp_43bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 44: + falp_44bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 45: + falp_45bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + 
case 46: + falp_46bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 47: + falp_47bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 48: + falp_48bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 49: + falp_49bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 50: + falp_50bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 51: + falp_51bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 52: + falp_52bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 53: + falp_53bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 54: + falp_54bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 55: + falp_55bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 56: + falp_56bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 57: + falp_57bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 58: + falp_58bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 59: + falp_59bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 60: + falp_60bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 61: + falp_61bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 62: + falp_62bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 63: + falp_63bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 64: + falp_64bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + } + } + } + } +} +;
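The dispatch function above selects one of 65 unrolled kernels by bit width; each kernel walks the 16 interleaved FastLanes lanes and materializes 64 values per lane (1024 doubles) with the same per-value recipe. For reference, the recipe can be written once instead of unrolled; the following function is illustrative only (names and the helper itself are not part of the patch) and covers the general case of bit widths 1 to 63, since 0 and 64 have dedicated kernels:

    #include <cstdint>

    // Decode one value: lane i (0..15), position k (0..63), bit width bw (1..63).
    // Mirrors the generated kernels: unpack bw bits from the interleaved input,
    // add the FOR base, multiply by FACT_ARR[fac], then scale by FRAC_ARR[exp].
    static double falp_one_value(const uint64_t* in, int i, int k, unsigned bw,
                                 uint64_t base, int64_t factor, double frac10) {
        unsigned start = (unsigned)k * bw;   // bit offset inside lane i
        unsigned word  = start / 64;         // 64-bit word index within the lane
        unsigned shift = start % 64;
        uint64_t mask  = (1ULL << bw) - 1;
        uint64_t bits  = (in[word * 16 + i] >> shift) & mask;
        if (shift + bw > 64) {               // value straddles two words
            bits |= (in[(word + 1) * 16 + i] << (64 - shift)) & mask;
        }
        int64_t digits = (int64_t)((bits + base) * (uint64_t)factor);
        return (double)digits * frac10;      // the kernels store this at out[k * 16 + i]
    }
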
diff --git a/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_test.cpp b/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_test.cpp
new file mode 100644
index 0000000..556817f
--- /dev/null
+++ b/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_test.cpp
@@ -0,0 +1,119 @@
+#include "alp/alp.hpp"
+#include "datasets.hpp"
+#include "alp/ffor.hpp"
+#include "alp/unffor.hpp"
+#include "gtest/gtest.h"
+class fallback_scalar_nav_1024_uf1_falp : public ::testing::Test
+{
+  public:
+    double *   dbl_arr;
+    double *   exc_arr;
+    uint16_t * pos_arr;
+    uint16_t * exc_c_arr;
+    int64_t *  ffor_arr;
+    int64_t *  unffor_arr;
+    int64_t *  base_arr;
+    int64_t *  dig_arr;
+    double *   dec_dbl_arr;
+    uint8_t    bw;
+    uint8_t    factor;
+    uint8_t    exponent;
+    double *   smp_arr;
+    void SetUp() override
+    {
+        dbl_arr     = new double[1024];
+        exc_arr     = new double[1024];
+        pos_arr     = new uint16_t[1024];
+        dig_arr     = new int64_t[1024];
+        dec_dbl_arr = new double[1024];
+        exc_c_arr   = new uint16_t[1024];
+        ffor_arr    = new int64_t[1024];
+        unffor_arr  = new int64_t[1024];
+        base_arr    = new int64_t[1024];
+        smp_arr     = new double[ALP_VECTOR_SIZE];
+    }
+    ~fallback_scalar_nav_1024_uf1_falp() override
+    {
+        delete[] dbl_arr;
+        delete[] exc_arr;
+        delete[] pos_arr;
+        delete[] dig_arr;
+        delete[] dec_dbl_arr;
+        delete[] exc_c_arr;
+        delete[] ffor_arr;
+        delete[] unffor_arr;
+        delete[] base_arr;
+        delete[] smp_arr;
+    }
+}
+;
+TEST_F(fallback_scalar_nav_1024_uf1_falp, fused)
+{
+    for (auto & dataset : alp_bench::datasets)
+    {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+        ASSERT_EQ(ifile.fail(), false);
+        alp::state stt;
+        if (dataset.suitable_for_cutting) { continue; }
+        if (dataset.name.find("bw") != std::string::npos) { continue; }
+        double num = 0.0;
+        size_t c {0};
+        while (ifile >> num)
+        {
+            dbl_arr[c] = num;
+            c = c + 1;
+        }
+        // Init
+        alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+        // Encode
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+        // Decode
+        generated::falp::fallback::scalar::falp(reinterpret_cast<uint64_t *>(ffor_arr), dec_dbl_arr, bw, reinterpret_cast<uint64_t *>(base_arr), stt.fac, stt.exp);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+        for (size_t i = 0; i < 1024; ++i)
+        {
+            ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+        }
+        ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+        ASSERT_EQ(dataset.bit_width, bw);
+        ifile.close();
+    }
+}
+
+TEST_F(fallback_scalar_nav_1024_uf1_falp, unfused)
+{
+    for (auto & dataset : alp_bench::datasets)
+    {
+        std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
+        ASSERT_EQ(ifile.fail(), false);
+        alp::state stt;
+        if (dataset.suitable_for_cutting) { continue; }
+        if (dataset.name.find("bw") != std::string::npos) { continue; }
+        double num = 0.0;
+        size_t c {0};
+        while (ifile >> num)
+        {
+            dbl_arr[c] = num;
+            c = c + 1;
+        }
+        // Init
+        alp::AlpEncode::init(dbl_arr, 0, 1024, smp_arr, stt);
+        // Encode
+        alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
+        alp::AlpEncode::analyze_ffor(dig_arr, bw, base_arr);
+        alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);
+        // Decode
+        alp::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
+        alp::AlpDecode(reinterpret_cast<uint64_t *>(unffor_arr), stt.fac, stt.exp, dec_dbl_arr);
+        alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+        for (size_t i = 0; i < 1024; ++i)
+        {
+            ASSERT_EQ(dbl_arr[i], dec_dbl_arr[i]);
+        }
+        ASSERT_EQ(dataset.exceptions_count, exc_c_arr[0]);
+        ASSERT_EQ(dataset.bit_width, bw);
+        ifile.close();
+    }
+}
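Both tests exercise the same encode path and differ only in decoding: the fused test calls the generated falp kernel (FFOR unpack and ALP decode in one pass), while the unfused test runs unffor first and then the plain ALP decode. Both finish by patching exceptions over the decoded output before comparing. The underlying patch operation is a simple scatter of verbatim-stored values; a minimal sketch of the assumed semantics of patch_exceptions (the patch's own definition presumably lives in include/alp/decode.hpp and is not shown here):

    #include <cstdint>

    // Assumed semantics: exc holds the original doubles ALP could not round-trip,
    // pos their positions in the 1024-value vector, count[0] how many there are.
    static void patch_exceptions_sketch(double* out, const double* exc,
                                        const uint16_t* pos, const uint16_t* count) {
        for (uint16_t j = 0; j < count[0]; ++j) {
            out[pos[j]] = exc[j];  // overwrite the lossy decode with the original
        }
    }
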
diff --git a/generated/fallback/scalar_nav_uf1/falp.cmake b/generated/fallback/scalar_nav_uf1/falp.cmake
new file mode 100644
index 0000000..4bf3e6b
--- /dev/null
+++ b/generated/fallback/scalar_nav_uf1/falp.cmake
@@ -0,0 +1,31 @@
+add_library(fallback_scalar_nav_1024_uf1_falp OBJECT
+            fallback_scalar_nav_1024_uf1_falp_src.cpp)
+target_compile_definitions(fallback_scalar_nav_1024_uf1_falp PRIVATE IS_SCALAR)
+set(FLAG -O3)
+check_cxx_compiler_flag(${FLAG} HAS_FLAG)
+if(HAS_FLAG)
+else()
+    message(STATUS "The flag ${FLAG} is not supported by the current compiler")
+endif()
+target_compile_options(fallback_scalar_nav_1024_uf1_falp PUBLIC ${FLAG})
+cmake_print_properties(TARGETS fallback_scalar_nav_1024_uf1_falp
+                       PROPERTIES COMPILE_DEFINITIONS
+                       PROPERTIES COMPILE_OPTIONS)
+LIST(APPEND ALP_GENERATED_OBJECT_FILES
+     $<TARGET_OBJECTS:fallback_scalar_nav_1024_uf1_falp>)
+get_target_property(TARGET_NAME fallback_scalar_nav_1024_uf1_falp NAME)
+get_target_property(TARGET_COMPILE_OPTIONS fallback_scalar_nav_1024_uf1_falp COMPILE_OPTIONS)
+#------------------------------------------------------------------------------------------------------
+add_executable(fallback_scalar_nav_1024_uf1_falp_test fallback_scalar_nav_1024_uf1_falp_test.cpp)
+target_link_libraries(fallback_scalar_nav_1024_uf1_falp_test PRIVATE fallback_scalar_nav_1024_uf1_falp)
+target_link_libraries(fallback_scalar_nav_1024_uf1_falp_test PRIVATE alp_ffor)
+target_link_libraries(fallback_scalar_nav_1024_uf1_falp_test PRIVATE gtest_main)
+target_include_directories(fallback_scalar_nav_1024_uf1_falp_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+gtest_discover_tests(fallback_scalar_nav_1024_uf1_falp_test)
+#------------------------------------------------------------------------------------------------------
+configure_file(${CMAKE_SOURCE_DIR}/alp_bench/alp_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/fallback_scalar_nav_1024_uf1_falp_bench.hpp)
+add_executable(fallback_scalar_nav_1024_uf1_falp_bench fallback_scalar_nav_1024_uf1_falp_bench.cpp)
+target_link_libraries(fallback_scalar_nav_1024_uf1_falp_bench PRIVATE fallback_scalar_nav_1024_uf1_falp)
+target_link_libraries(fallback_scalar_nav_1024_uf1_falp_bench PRIVATE alp_ffor)
+target_include_directories(fallback_scalar_nav_1024_uf1_falp_bench PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+add_alp_benchmark(fallback_scalar_nav_1024_uf1_falp_bench)
diff --git a/generated/generated_files.txt b/generated/generated_files.txt
new file mode 100644
index 0000000..7e3599d
--- /dev/null
+++ b/generated/generated_files.txt
@@ -0,0 +1,36 @@
+arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_src.cpp
+arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp
+arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp
+arm64v8/neon_intrinsic_uf1/falp.cmake
+arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_src.cpp
+arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_bench.cpp
+arm64v8/sve_intrinsic_uf1/arm64v8_sve_intrinsic_1024_uf1_falp_test.cpp
+arm64v8/sve_intrinsic_uf1/falp.cmake
+x86_64/sse_intrinsic_uf1/x86_64_sse_intrinsic_1024_uf1_falp_src.cpp
+x86_64/sse_intrinsic_uf1/x86_64_sse_intrinsic_1024_uf1_falp_bench.cpp
+x86_64/sse_intrinsic_uf1/x86_64_sse_intrinsic_1024_uf1_falp_test.cpp
+x86_64/sse_intrinsic_uf1/falp.cmake
+x86_64/avx2_intrinsic_uf1/x86_64_avx2_intrinsic_1024_uf1_falp_src.cpp
+x86_64/avx2_intrinsic_uf1/x86_64_avx2_intrinsic_1024_uf1_falp_bench.cpp
+x86_64/avx2_intrinsic_uf1/x86_64_avx2_intrinsic_1024_uf1_falp_test.cpp
+x86_64/avx2_intrinsic_uf1/falp.cmake
+x86_64/avx512bw_intrinsic_uf1/x86_64_avx512bw_intrinsic_1024_uf1_falp_src.cpp
+x86_64/avx512bw_intrinsic_uf1/x86_64_avx512bw_intrinsic_1024_uf1_falp_bench.cpp
+x86_64/avx512bw_intrinsic_uf1/x86_64_avx512bw_intrinsic_1024_uf1_falp_test.cpp
+x86_64/avx512bw_intrinsic_uf1/falp.cmake
+wasm/simd128_intrinsic_uf1/wasm_simd128_intrinsic_1024_uf1_falp_src.cpp
+wasm/simd128_intrinsic_uf1/wasm_simd128_intrinsic_1024_uf1_falp_bench.cpp
+wasm/simd128_intrinsic_uf1/wasm_simd128_intrinsic_1024_uf1_falp_test.cpp
+wasm/simd128_intrinsic_uf1/falp.cmake
+fallback/scalar_av_uf1/fallback_scalar_av_1024_uf1_falp_src.cpp
+fallback/scalar_av_uf1/fallback_scalar_av_1024_uf1_falp_bench.cpp
+fallback/scalar_av_uf1/fallback_scalar_av_1024_uf1_falp_test.cpp
+fallback/scalar_av_uf1/falp.cmake
+fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_src.cpp
+fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_bench.cpp
+fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_test.cpp
+fallback/scalar_aav_uf1/falp.cmake
+fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_src.cpp
+fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp
+fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_test.cpp
+fallback/scalar_nav_uf1/falp.cmake
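generated_files.txt enumerates one kernel source, benchmark, test, and falp.cmake per target (NEON, SVE, SSE, AVX2, AVX-512BW, WASM SIMD128, and three scalar fallbacks). The public surface that ties these backends together is the umbrella header added next; a minimal smoke test of it, illustrative only and assuming the include paths set up by the CMake above:

    #include "alp.hpp"

    int main() {
        alp::state stt;  // per-vector state threaded through the encode/decode API
        (void)stt;
        return 0;
    }
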
+#include "alp/encode.hpp" +#include "alp/falp.hpp" +#include "alp/rd.hpp" +#include "alp/sampler.hpp" +#include "alp/storer.hpp" +#include "alp/utils.hpp" +#include "fastlanes/ffor.hpp" +#include "fastlanes/unffor.hpp" + +#endif // ALP_ALP_HPP diff --git a/include/alp/common.hpp b/include/alp/common.hpp new file mode 100644 index 0000000..0ffae8d --- /dev/null +++ b/include/alp/common.hpp @@ -0,0 +1,19 @@ +#ifndef ALP_COMMON_HPP +#define ALP_COMMON_HPP + +#include + +namespace alp { +//! bitwidth type +using bw_t = uint8_t; +//! exception counter type +using exp_c_t = uint16_t; +//! exception position type +using exp_p_t = uint16_t; +//! factor idx type +using factor_idx_t = uint8_t; +//! exponent idx type +using exponent_idx_t = uint8_t; +} // namespace alp + +#endif // ALP_COMMON_HPP diff --git a/include/alp/compressor.hpp b/include/alp/compressor.hpp new file mode 100644 index 0000000..ea0ca65 --- /dev/null +++ b/include/alp/compressor.hpp @@ -0,0 +1,212 @@ +#ifndef ALP_COMPRESSOR_HPP +#define ALP_COMPRESSOR_HPP + +#include "alp/encode.hpp" +#include "alp/rd.hpp" +#include "alp/state.hpp" +#include "alp/storer.hpp" +#include "alp/utils.hpp" +#include "fastlanes/ffor.hpp" + +namespace alp { + +/* + * API Compressor + */ +template +struct AlpCompressor { + + using EXACT_TYPE = typename FloatingToExact::type; + + state stt; + storer::MemStorer storer; + + T input_vector[config::VECTOR_SIZE]; + T exceptions[config::VECTOR_SIZE]; + T sample_array[config::VECTOR_SIZE]; + int64_t encoded_integers[config::VECTOR_SIZE]; + int64_t alp_encoded_array[config::VECTOR_SIZE]; + uint16_t exceptions_rd[config::VECTOR_SIZE]; + uint16_t exceptions_position[config::VECTOR_SIZE]; + + // 'right' & 'left' refer to the respective parts of the floating numbers after splitting (alprd) + uint64_t alp_bp_size; + uint64_t left_bp_size; + uint64_t right_bp_size; + EXACT_TYPE right_parts[config::VECTOR_SIZE]; + EXACT_TYPE right_parts_encoded[config::VECTOR_SIZE]; + uint16_t left_parts_encoded[config::VECTOR_SIZE]; + uint16_t left_parts[config::VECTOR_SIZE]; + + EXACT_TYPE right_for_base = 0; // Always 0 + + AlpCompressor() {} + + size_t get_size() { return storer.get_size(); } + + /* + * ALP Compression + * Note that Kernels of ALP and FFOR are not fused + */ + void compress_vector() { + if (stt.scheme == SCHEME::ALP_RD) { + compress_rd_vector(); + } else { + compress_alp_vector(); + } + } + + void compress_alp_vector() { + AlpEncode::encode( + input_vector, exceptions, exceptions_position, &stt.exceptions_count, encoded_integers, stt); + AlpEncode::analyze_ffor(encoded_integers, stt.bit_width, &stt.for_base); + ffor::ffor(encoded_integers, alp_encoded_array, stt.bit_width, &stt.for_base); + alp_bp_size = AlpApiUtils::get_size_after_bitpacking(stt.bit_width); + } + + void compress_rd_vector() { + AlpRD::encode( + input_vector, exceptions_rd, exceptions_position, &stt.exceptions_count, right_parts, left_parts, stt); + ffor::ffor(right_parts, right_parts_encoded, stt.right_bit_width, &right_for_base); + ffor::ffor(left_parts, left_parts_encoded, stt.left_bit_width, &stt.left_for_base); + } + + void compress(T* values, size_t values_count, uint8_t* out) { + storer = storer::MemStorer(out); + size_t rouwgroup_count = AlpApiUtils::get_rowgroup_count(values_count); + size_t current_idx = 0; + size_t left_to_compress = values_count; + for (size_t current_rowgroup = 0; current_rowgroup < rouwgroup_count; current_rowgroup++) { + /* + * Rowgroup level + */ + AlpEncode::init(values, current_idx, values_count, sample_array, 
diff --git a/include/alp/compressor.hpp b/include/alp/compressor.hpp
new file mode 100644
index 0000000..ea0ca65
--- /dev/null
+++ b/include/alp/compressor.hpp
@@ -0,0 +1,212 @@
+#ifndef ALP_COMPRESSOR_HPP
+#define ALP_COMPRESSOR_HPP
+
+#include "alp/encode.hpp"
+#include "alp/rd.hpp"
+#include "alp/state.hpp"
+#include "alp/storer.hpp"
+#include "alp/utils.hpp"
+#include "fastlanes/ffor.hpp"
+
+namespace alp {
+
+/*
+ * API Compressor
+ */
+template <typename T>
+struct AlpCompressor {
+
+    using EXACT_TYPE = typename FloatingToExact<T>::type;
+
+    state stt;
+    storer::MemStorer storer;
+
+    T input_vector[config::VECTOR_SIZE];
+    T exceptions[config::VECTOR_SIZE];
+    T sample_array[config::VECTOR_SIZE];
+    int64_t encoded_integers[config::VECTOR_SIZE];
+    int64_t alp_encoded_array[config::VECTOR_SIZE];
+    uint16_t exceptions_rd[config::VECTOR_SIZE];
+    uint16_t exceptions_position[config::VECTOR_SIZE];
+
+    // 'right' & 'left' refer to the respective parts of the floating numbers after splitting (alprd)
+    uint64_t alp_bp_size;
+    uint64_t left_bp_size;
+    uint64_t right_bp_size;
+    EXACT_TYPE right_parts[config::VECTOR_SIZE];
+    EXACT_TYPE right_parts_encoded[config::VECTOR_SIZE];
+    uint16_t left_parts_encoded[config::VECTOR_SIZE];
+    uint16_t left_parts[config::VECTOR_SIZE];
+
+    EXACT_TYPE right_for_base = 0; // Always 0
+
+    AlpCompressor() {}
+
+    size_t get_size() { return storer.get_size(); }
+
+    /*
+     * ALP Compression
+     * Note that the kernels of ALP and FFOR are not fused here
+     */
+    void compress_vector() {
+        if (stt.scheme == SCHEME::ALP_RD) {
+            compress_rd_vector();
+        } else {
+            compress_alp_vector();
+        }
+    }
+
+    void compress_alp_vector() {
+        AlpEncode::encode(
+            input_vector, exceptions, exceptions_position, &stt.exceptions_count, encoded_integers, stt);
+        AlpEncode::analyze_ffor(encoded_integers, stt.bit_width, &stt.for_base);
+        ffor::ffor(encoded_integers, alp_encoded_array, stt.bit_width, &stt.for_base);
+        alp_bp_size = AlpApiUtils::get_size_after_bitpacking(stt.bit_width);
+    }
+
+    void compress_rd_vector() {
+        AlpRD::encode(
+            input_vector, exceptions_rd, exceptions_position, &stt.exceptions_count, right_parts, left_parts, stt);
+        ffor::ffor(right_parts, right_parts_encoded, stt.right_bit_width, &right_for_base);
+        ffor::ffor(left_parts, left_parts_encoded, stt.left_bit_width, &stt.left_for_base);
+    }
+
+    void compress(T* values, size_t values_count, uint8_t* out) {
+        storer = storer::MemStorer(out);
+        size_t rowgroup_count = AlpApiUtils::get_rowgroup_count(values_count);
+        size_t current_idx = 0;
+        size_t left_to_compress = values_count;
+        for (size_t current_rowgroup = 0; current_rowgroup < rowgroup_count; current_rowgroup++) {
+            /*
+             * Rowgroup level
+             */
+            AlpEncode::init(values, current_idx, values_count, sample_array, stt);
+            if (stt.scheme == SCHEME::ALP_RD) {
+                AlpRD::init(values, current_idx, values_count, sample_array, stt);
+                left_bp_size  = AlpApiUtils::get_size_after_bitpacking(stt.left_bit_width);
+                right_bp_size = AlpApiUtils::get_size_after_bitpacking(stt.right_bit_width);
+            }
+            store_rowgroup_metadata();
+
+            size_t values_left_in_rowgroup = std::min(config::ROWGROUP_SIZE, left_to_compress);
+            size_t vectors_in_rowgroup = AlpApiUtils::get_complete_vector_count(values_left_in_rowgroup);
+            for (size_t vector_idx = 0; vector_idx < vectors_in_rowgroup; vector_idx++) {
+                /*
+                 * Vector level
+                 */
+                for (T& idx : input_vector) {
+                    idx = values[current_idx++];
+                }
+                compress_vector();
+                store_vector();
+                left_to_compress -= config::VECTOR_SIZE;
+            }
+        }
+        if (left_to_compress) { // Last vector which may be incomplete
+            stt.vector_size = left_to_compress;
+            for (size_t idx = 0; idx < left_to_compress; idx++) {
+                input_vector[idx] = values[current_idx++];
+            }
+            if (stt.scheme == SCHEME::ALP_RD) {
+                AlpApiUtils::fill_incomplete_alprd_vector(input_vector, stt);
+            } else {
+                AlpApiUtils::fill_incomplete_alp_vector(
+                    input_vector, exceptions, exceptions_position, &stt.exceptions_count, encoded_integers, stt);
+            }
+            compress_vector();
+            store_vector();
+        }
+    };
+
+    void compress_rd(T* values, size_t values_count, uint8_t* out) {
+        storer = storer::MemStorer(out);
+        size_t rowgroup_count = AlpApiUtils::get_rowgroup_count(values_count);
+        size_t current_idx = 0;
+        size_t left_to_compress = values_count;
+        for (size_t current_rowgroup = 0; current_rowgroup < rowgroup_count; current_rowgroup++) {
+            /*
+             * Rowgroup level
+             */
+            AlpRD::init(values, current_idx, values_count, sample_array, stt);
+            left_bp_size  = AlpApiUtils::get_size_after_bitpacking(stt.left_bit_width);
+            right_bp_size = AlpApiUtils::get_size_after_bitpacking(stt.right_bit_width);
+
+            store_rowgroup_metadata();
+
+            size_t values_left_in_rowgroup = std::min(config::ROWGROUP_SIZE, left_to_compress);
+            size_t vectors_in_rowgroup = AlpApiUtils::get_complete_vector_count(values_left_in_rowgroup);
+            for (size_t vector_idx = 0; vector_idx < vectors_in_rowgroup; vector_idx++) {
+                /*
+                 * Vector level
+                 */
+                for (size_t idx = 0; idx < config::VECTOR_SIZE; idx++) {
+                    input_vector[idx] = values[current_idx++];
+                }
+                compress_rd_vector();
+                store_rd_vector();
+                left_to_compress -= config::VECTOR_SIZE;
+            }
+        }
+        if (left_to_compress) { // Last vector which may be incomplete
+            stt.vector_size = left_to_compress;
+            for (size_t idx = 0; idx < left_to_compress; idx++) {
+                input_vector[idx] = values[current_idx++];
+            }
+            AlpApiUtils::fill_incomplete_alprd_vector(input_vector, stt);
+            compress_rd_vector();
+            store_rd_vector();
+        }
+    }
+
+    void store_rd_vector() {
+        storer.store((void*)&stt.exceptions_count, sizeof(stt.exceptions_count));
+        storer.store((void*)left_parts_encoded, left_bp_size);
+        storer.store((void*)right_parts_encoded, right_bp_size);
+        if (stt.exceptions_count) {
+            storer.store((void*)exceptions_rd, RD_EXCEPTION_SIZE_BYTES * stt.exceptions_count);
+            storer.store((void*)exceptions_position, RD_EXCEPTION_POSITION_SIZE_BYTES * stt.exceptions_count);
+        }
+    }
+
+    void store_alp_vector() {
+        storer.store((void*)&stt.exp, sizeof(stt.exp));
+        storer.store((void*)&stt.fac, sizeof(stt.fac));
+        storer.store((void*)&stt.exceptions_count, sizeof(stt.exceptions_count));
+        storer.store((void*)&stt.for_base, sizeof(stt.for_base));
+        storer.store((void*)&stt.bit_width, sizeof(stt.bit_width));
+        storer.store((void*)alp_encoded_array, alp_bp_size);
+        if (stt.exceptions_count) {
+            storer.store((void*)exceptions, Constants<T>::EXCEPTION_SIZE_BYTES * stt.exceptions_count);
+            storer.store((void*)exceptions_position, EXCEPTION_POSITION_SIZE_BYTES * stt.exceptions_count);
+        }
+    }
+
+    void store_schema() {
+        uint8_t scheme_code = (uint8_t)stt.scheme;
+        storer.store((void*)&scheme_code, sizeof(scheme_code));
+    }
+
+    void store_vector() {
+        if (stt.scheme == SCHEME::ALP_RD) {
+            store_rd_vector();
+        } else {
+            store_alp_vector();
+        }
+    }
+
+    void store_rd_metadata() {
+        storer.store((void*)&stt.right_bit_width, sizeof(stt.right_bit_width));
+        storer.store((void*)&stt.left_bit_width, sizeof(stt.left_bit_width));
+        storer.store((void*)&stt.actual_dictionary_size, sizeof(stt.actual_dictionary_size));
+        storer.store((void*)stt.left_parts_dict, stt.actual_dictionary_size_bytes);
+    }
+
+    void store_rowgroup_metadata() {
+        store_schema();
+        if (stt.scheme == SCHEME::ALP_RD) { store_rd_metadata(); }
+    }
+};
+
+} // namespace alp
+
+#endif
\ No newline at end of file
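Taken together, compress() drives the whole pipeline: sample each rowgroup, pick a scheme, then encode and store vector by vector, padding the final partial vector. A hypothetical end-to-end call of the compressor above; the output buffer size is a deliberately generous guess, not a bound the patch defines:

    #include <cstdint>
    #include <vector>
    #include "alp.hpp"

    int main() {
        std::vector<double> values(100000, 1.2345);
        // Generous scratch output; the bytes actually written are queried after.
        std::vector<uint8_t> out(values.size() * sizeof(double) * 2);
        alp::AlpCompressor<double> compressor;
        compressor.compress(values.data(), values.size(), out.data());
        auto used = compressor.get_size();  // bytes written through MemStorer
        (void)used;
        return 0;
    }
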
+
+    void compress_rd(T* values, size_t values_count, uint8_t* out) {
+        storer = storer::MemStorer<false>(out);
+        size_t rowgroup_count = AlpApiUtils<T>::get_rowgroup_count(values_count);
+        size_t current_idx = 0;
+        size_t left_to_compress = values_count;
+        for (size_t current_rowgroup = 0; current_rowgroup < rowgroup_count; current_rowgroup++) {
+            /*
+             * Rowgroup level
+             */
+            AlpRD<T>::init(values, current_idx, values_count, sample_array, stt);
+            left_bp_size = AlpApiUtils<T>::get_size_after_bitpacking(stt.left_bit_width);
+            right_bp_size = AlpApiUtils<T>::get_size_after_bitpacking(stt.right_bit_width);
+
+            store_rowgroup_metadata();
+
+            size_t values_left_in_rowgroup = std::min(config::ROWGROUP_SIZE, left_to_compress);
+            size_t vectors_in_rowgroup = AlpApiUtils<T>::get_complete_vector_count(values_left_in_rowgroup);
+            for (size_t vector_idx = 0; vector_idx < vectors_in_rowgroup; vector_idx++) {
+                /*
+                 * Vector level
+                 */
+                for (size_t idx = 0; idx < config::VECTOR_SIZE; idx++) {
+                    input_vector[idx] = values[current_idx++];
+                }
+                compress_rd_vector();
+                store_rd_vector();
+                left_to_compress -= config::VECTOR_SIZE;
+            }
+        }
+        if (left_to_compress) { // Last vector which may be incomplete
+            stt.vector_size = left_to_compress;
+            for (size_t idx = 0; idx < left_to_compress; idx++) {
+                input_vector[idx] = values[current_idx++];
+            }
+            AlpApiUtils<T>::fill_incomplete_alprd_vector(input_vector, stt);
+            compress_rd_vector();
+            store_rd_vector();
+        }
+    }
+
+    void store_rd_vector() {
+        storer.store((void*)&stt.exceptions_count, sizeof(stt.exceptions_count));
+        storer.store((void*)left_parts_encoded, left_bp_size);
+        storer.store((void*)right_parts_encoded, right_bp_size);
+        if (stt.exceptions_count) {
+            storer.store((void*)exceptions_rd, RD_EXCEPTION_SIZE_BYTES * stt.exceptions_count);
+            storer.store((void*)exceptions_position, RD_EXCEPTION_POSITION_SIZE_BYTES * stt.exceptions_count);
+        }
+    }
+
+    void store_alp_vector() {
+        storer.store((void*)&stt.exp, sizeof(stt.exp));
+        storer.store((void*)&stt.fac, sizeof(stt.fac));
+        storer.store((void*)&stt.exceptions_count, sizeof(stt.exceptions_count));
+        storer.store((void*)&stt.for_base, sizeof(stt.for_base));
+        storer.store((void*)&stt.bit_width, sizeof(stt.bit_width));
+        storer.store((void*)alp_encoded_array, alp_bp_size);
+        if (stt.exceptions_count) {
+            storer.store((void*)exceptions, Constants<T>::EXCEPTION_SIZE_BYTES * stt.exceptions_count);
+            storer.store((void*)exceptions_position, EXCEPTION_POSITION_SIZE_BYTES * stt.exceptions_count);
+        }
+    }
+
+    void store_schema() {
+        uint8_t scheme_code = (uint8_t)stt.scheme;
+        storer.store((void*)&scheme_code, sizeof(scheme_code));
+    }
+
+    void store_vector() {
+        if (stt.scheme == SCHEME::ALP_RD) {
+            store_rd_vector();
+        } else {
+            store_alp_vector();
+        }
+    }
+
+    void store_rd_metadata() {
+        storer.store((void*)&stt.right_bit_width, sizeof(stt.right_bit_width));
+        storer.store((void*)&stt.left_bit_width, sizeof(stt.left_bit_width));
+        storer.store((void*)&stt.actual_dictionary_size, sizeof(stt.actual_dictionary_size));
+        storer.store((void*)stt.left_parts_dict, stt.actual_dictionary_size_bytes);
+    }
+
+    void store_rowgroup_metadata() {
+        store_schema();
+        if (stt.scheme == SCHEME::ALP_RD) { store_rd_metadata(); }
+    }
+};
+
+} // namespace alp
+
+#endif
\ No newline at end of file
diff --git a/include/alp/config.hpp b/include/alp/config.hpp
new file mode 100644
index 0000000..7282440
--- /dev/null
+++ b/include/alp/config.hpp
@@ -0,0 +1,28 @@
+#ifndef ALP_CONFIG_HPP
+#define ALP_CONFIG_HPP
+
+#include <cstddef>
+
+/*
+ * ALP Configs
+ */
+namespace alp::config {
+/// ALP vector size (we recommend against changing this; it should stay constant)
+inline constexpr size_t VECTOR_SIZE = 1024;
+/// Rowgroup size
+inline constexpr size_t ROWGROUP_SIZE = 100UL * VECTOR_SIZE;
+/// Number of vectors from the rowgroup from which to take samples; used to calculate the jumps
+inline constexpr size_t ROWGROUP_VECTOR_SAMPLES = 8;
+/// How many equidistant vectors we must jump within a rowgroup
+inline constexpr size_t ROWGROUP_SAMPLES_JUMP = (ROWGROUP_SIZE / ROWGROUP_VECTOR_SAMPLES) / VECTOR_SIZE;
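+// [Editor's note] Worked example, not in the original sources: with the
+// defaults above, ROWGROUP_SIZE = 100 * 1024 = 102400 values, so
+// ROWGROUP_SAMPLES_JUMP = (102400 / 8) / 1024 = 12 (integer division),
+// i.e. every 12th vector of a rowgroup is sampled.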
+/// Values to sample per vector
+inline constexpr size_t SAMPLES_PER_VECTOR = 32;
+/// Maximum number of combinations obtained from rowgroup sampling
+inline constexpr size_t MAX_K_COMBINATIONS = 5;
+inline constexpr size_t CUTTING_LIMIT = 16;
+inline constexpr size_t MAX_RD_DICT_BIT_WIDTH = 3;
+inline constexpr size_t MAX_RD_DICTIONARY_SIZE = (1 << MAX_RD_DICT_BIT_WIDTH);
+
+} // namespace alp::config
+
+#endif
\ No newline at end of file
diff --git a/include/alp/constants.hpp b/include/alp/constants.hpp
new file mode 100644
index 0000000..23903e4
--- /dev/null
+++ b/include/alp/constants.hpp
@@ -0,0 +1,242 @@
+#ifndef ALP_CONSTANTS_HPP
+#define ALP_CONSTANTS_HPP
+
+#include "alp/config.hpp"
+#include <cstddef>
+#include <cstdint>
+
+namespace alp {
+
+enum class SCHEME : uint8_t {
+	ALP_RD,
+	ALP,
+};
+
+template <typename T>
+struct FloatingToExact {};
+
+template <>
+struct FloatingToExact<double> {
+	typedef uint64_t type;
+};
+
+template <>
+struct FloatingToExact<float> {
+	typedef uint32_t type;
+};
+
+inline constexpr uint8_t SAMPLING_EARLY_EXIT_THRESHOLD = 2;
+// Largest / smallest doubles which can still be cast to int64_t (2^63 - 1024 and its negation)
+inline constexpr double ENCODING_UPPER_LIMIT = 9223372036854774784;
+inline constexpr double ENCODING_LOWER_LIMIT = -9223372036854774784;
+inline constexpr uint8_t DICTIONARY_ELEMENT_SIZE_BYTES = 2;
+inline constexpr uint8_t RD_EXCEPTION_POSITION_SIZE = 16;
+inline constexpr uint8_t RD_EXCEPTION_POSITION_SIZE_BYTES = RD_EXCEPTION_POSITION_SIZE / 8;
+inline constexpr uint8_t EXCEPTION_POSITION_SIZE = 16;
+inline constexpr uint8_t EXCEPTION_POSITION_SIZE_BYTES = EXCEPTION_POSITION_SIZE / 8;
+inline constexpr uint8_t RD_EXCEPTION_SIZE = 16;
+inline constexpr uint8_t RD_EXCEPTION_SIZE_BYTES = RD_EXCEPTION_SIZE / 8;
+
+template <typename T>
+struct Constants {};
+
+template <>
+struct Constants<float> {
+	/// 22 bits per value * 32 values in the sampled vector
+	static inline constexpr size_t RD_SIZE_THRESHOLD_LIMIT = 22 * alp::config::SAMPLES_PER_VECTOR;
+	static inline constexpr float MAGIC_NUMBER = 12582912.0;
+	static inline constexpr uint8_t EXCEPTION_SIZE = 32;
+	static inline constexpr uint8_t EXCEPTION_SIZE_BYTES = EXCEPTION_SIZE / 8;
+	static inline constexpr uint8_t MAX_EXPONENT = 10;
+
+	// -Inf: 11111111100000000000000000000000
+	// +Inf: 01111111100000000000000000000000
+	// -0.0: 10000000000000000000000000000000
+	static constexpr uint32_t NEGATIVE_ZERO = 0b10000000000000000000000000000000;
+	static constexpr uint32_t POSITIVE_INF = 0b01111111100000000000000000000000;
+	static constexpr uint32_t NEGATIVE_INF = 0b11111111100000000000000000000000;
+
+	static inline constexpr float FRAC_ARR[] = {
+	    1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001, 0.00000001, 0.000000001, 0.0000000001};
+
+	static inline constexpr float EXP_ARR[] = {
+	    1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0, 1000000000.0, 10000000000.0};
+};
+
+template <>
+struct Constants<double> {
+	/// 48 bits per value * 32 values in the sampled vector
+	static inline constexpr size_t RD_SIZE_THRESHOLD_LIMIT = 48 * alp::config::SAMPLES_PER_VECTOR;
+	static inline constexpr double MAGIC_NUMBER {0x0018000000000000};
+	static inline constexpr uint8_t EXCEPTION_SIZE = 64;
+	static inline constexpr uint8_t EXCEPTION_SIZE_BYTES = EXCEPTION_SIZE / 8;
+	static inline constexpr uint8_t MAX_EXPONENT = 18;
+
+	// -Inf: 1111111111110000000000000000000000000000000000000000000000000000
+	// +Inf: 0111111111110000000000000000000000000000000000000000000000000000
+	// -0.0: 1000000000000000000000000000000000000000000000000000000000000000
+	static constexpr uint64_t NEGATIVE_ZERO = 0b1000000000000000000000000000000000000000000000000000000000000000;
+	static constexpr uint64_t POSITIVE_INF = 0b0111111111110000000000000000000000000000000000000000000000000000;
+	static constexpr uint64_t NEGATIVE_INF = 0b1111111111110000000000000000000000000000000000000000000000000000;
+
+	static inline constexpr double FRAC_ARR[] = {
+	    1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001,
+	    0.0000001, 0.00000001, 0.000000001, 0.0000000001, 0.00000000001,
+	    0.000000000001, 0.0000000000001, 0.00000000000001, 0.000000000000001,
+	    0.0000000000000001, 0.00000000000000001, 0.000000000000000001,
+	    0.0000000000000000001, 0.00000000000000000001,
+	};
+
+	static inline constexpr double EXP_ARR[] = {
+	    1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0,
+	    100000000.0, 1000000000.0, 10000000000.0, 100000000000.0, 1000000000000.0,
+	    10000000000000.0, 100000000000000.0, 1000000000000000.0, 10000000000000000.0,
+	    100000000000000000.0, 1000000000000000000.0, 10000000000000000000.0,
+	    100000000000000000000.0, 1000000000000000000000.0, 10000000000000000000000.0,
+	    100000000000000000000000.0,
+	};
+};
+
+inline constexpr int64_t FACT_ARR[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000,
+                                       1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000,
+                                       100000000000000, 1000000000000000, 10000000000000000, 100000000000000000,
+                                       1000000000000000000};
+
+inline constexpr int64_t U_FACT_ARR[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000,
+                                         1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000,
+                                         100000000000000, 1000000000000000,
10000000000000000, + 100000000000000000, + 1000000000000000000}; + +alignas(64) inline constexpr uint64_t INDEX_ARR[1024] { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, + 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, + 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, + 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, + 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, + 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, + 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, + 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, + 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, + 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, + 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, + 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, + 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, + 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, + 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, + 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, + 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, + 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, + 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, + 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, + 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, + 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, + 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, + 684, 685, 686, 687, 688, 689, 690, 691, 692, 
693, 694, 695, 696, 697, 698, 699, 700, 701, 702, + 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, + 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, + 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, + 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, + 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, + 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, + 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, + 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, + 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, + 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, + 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, + 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, + 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, + 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, + 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, +}; + +alignas(64) inline constexpr uint8_t LOOKUP_TABLE[256] { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, + 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, + 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, + 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, + 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, + 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; + +} // namespace alp + +#endif \ No newline at end of file diff --git a/include/alp/decode.hpp b/include/alp/decode.hpp new file mode 100644 index 0000000..db0a289 --- /dev/null +++ b/include/alp/decode.hpp @@ -0,0 +1,130 @@ +#ifndef ALP_DECODE_HPP +#define ALP_DECODE_HPP + +#include "common.hpp" +#include + +namespace alp { + +#ifdef AVX2 +#include "immintrin.h" + +// from: https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx +// Only works for inputs in the range: [-2^51, 2^51] +__m128i double_to_int64(__m128d x) { + x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000)); + return _mm_sub_epi64(_mm_castpd_si128(x), _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))); +} + +// Only works for inputs in the range: [-2^51, 2^51] +__m128d int64_to_double(__m128i x) { + x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))); + return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000)); +} + +/* + * 
 * scalar version of int64_to_double
+ */
+double int64_to_double(int64_t x) {
+	double magic_number = static_cast<double>(0x0018000000000000);
+	x = x + static_cast<int64_t>(magic_number);
+	return static_cast<double>(x) - static_cast<double>(magic_number);
+}
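+
+// [Editor's note] Worked example, not in the original sources: the magic
+// number 0x0018000000000000 is 2^52 + 2^51 as a double. Adding it moves an
+// integer in [-2^51, 2^51] into a range where every integer is exactly
+// representable, so subtracting the same constant recovers the value exactly,
+// e.g. int64_to_double(5) == 5.0; the SIMD variants perform the addition on
+// the raw bit patterns instead.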
+
+// SSE version of int64_to_double
+// Only works for inputs in the range: [-2^51, 2^51]
+__m128d sse_int64_to_double(__m128i x) {
+	x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
+	return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
+}
+
+__m256d int64_to_double_fast_precise(const __m256i v)
+/* Optimized full range int64_t to double conversion */
+/* Emulate _mm256_cvtepi64_pd() */
+{
+	__m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000);   /* 2^52 encoded as floating-point */
+	__m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); /* 2^84 + 2^63 encoded as floating-point */
+	__m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000);  /* 2^84 + 2^63 + 2^52 encoded as floating-point */
+	__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
+
+	__m256i v_lo =
+	    _mm256_blend_epi32(magic_i_lo, v, 0b01010101); /* Blend the 32 lowest significant bits of v with magic_int_lo */
+	__m256i v_hi = _mm256_srli_epi64(v, 32);           /* Extract the 32 most significant bits of v */
+	v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);       /* Flip the msb of v_hi and blend with 0x45300000 */
+	__m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); /* Compute in double precision: */
+	__m256d result = _mm256_add_pd(
+	    v_hi_dbl,
+	    _mm256_castsi256_pd(
+	        v_lo)); /* (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition !! */
+	return result;  /* With gcc use -O3, then -fno-associative-math is default. Do not use -Ofast, which enables
+	                   -fassociative-math! */
+}
+
+void sse_decode(const int64_t* digits, uint8_t fac_idx, uint8_t exp_idx, double* out_p) {
+	uint64_t factor = alp::U_FACT_ARR[fac_idx];
+	double frac10 = alp::Constants<double>::FRAC_ARR[exp_idx];
+	__m128i factor_sse = _mm_set1_epi64x(factor);
+	__m128d frac10_sse = _mm_set1_pd(frac10);
+
+	auto digits_p = reinterpret_cast<const __m128i*>(digits);
+
+	for (size_t i {0}; i < 512; ++i) {
+		__m128i digit = _mm_loadu_si128(digits_p + i);
+		__m128i tmp_int = digit * factor_sse;
+		__m128d tmp_dbl = sse_int64_to_double(tmp_int);
+		__m128d tmp_dbl_mlt = tmp_dbl * frac10_sse;
+		_mm_storeu_pd(out_p + (i * 2), tmp_dbl_mlt);
+	}
+}
+
+void avx2_decode(const int64_t* digits, uint8_t fac_idx, uint8_t exp_idx, double* out_p) {
+	uint64_t factor = alp::U_FACT_ARR[fac_idx];
+	double frac10 = alp::Constants<double>::FRAC_ARR[exp_idx];
+	__m256i factor_sse = _mm256_set1_epi64x(factor);
+	__m256d frac10_sse = _mm256_set1_pd(frac10);
+
+	auto digits_p = reinterpret_cast<const __m256i*>(digits);
+
+	for (size_t i {0}; i < 256; ++i) {
+		__m256i digit = _mm256_loadu_si256(digits_p + i);
+		__m256i tmp_int = digit * factor_sse;
+		__m256d tmp_dbl = int64_to_double_fast_precise(tmp_int);
+		__m256d tmp_dbl_mlt = tmp_dbl * frac10_sse;
+		_mm256_storeu_pd(out_p + (i * 4), tmp_dbl_mlt);
+	}
+}
+
+#endif
+
+template <typename T>
+struct AlpDecode {
+
+	//! Scalar decoding of a single value with ALP
+	static inline T decode_value(const int64_t encoded_value, const uint8_t factor, const uint8_t exponent) {
+		const T decoded_value = encoded_value * FACT_ARR[factor] * alp::Constants<T>::FRAC_ARR[exponent];
+		return decoded_value;
+	}
+
+	//! Scalar decoding of an ALP vector
+	static inline void
+	decode(const int64_t* encoded_integers, const uint8_t fac_idx, const uint8_t exp_idx, T* output) {
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			output[i] = decode_value(encoded_integers[i], fac_idx, exp_idx);
+		}
+	}
+
+	//! Patch Exceptions
+	static inline void patch_exceptions(T* out,
+	                                    const T* exceptions,
+	                                    const exp_p_t* exceptions_positions,
+	                                    const exp_c_t* exceptions_count) {
+		const auto exp_c = exceptions_count[0];
+		for (exp_c_t i {0}; i < exp_c; i++) {
+			out[exceptions_positions[i]] = exceptions[i];
+		}
+	}
+};
+
+} // namespace alp
+
+#endif // ALP_DECODE_HPP
diff --git a/include/alp/decompressor.hpp b/include/alp/decompressor.hpp
new file mode 100644
index 0000000..ace19e2
--- /dev/null
+++ b/include/alp/decompressor.hpp
@@ -0,0 +1,154 @@
+#ifndef ALP_DECOMPRESSOR_HPP
+#define ALP_DECOMPRESSOR_HPP
+
+#include "alp/decode.hpp"
+#include "alp/storer.hpp"
+#include "alp/utils.hpp"
+#include "fastlanes/unffor.hpp"
+
+namespace alp {
+
+/*
+ * API Decompressor
+ */
+template <typename T>
+struct AlpDecompressor {
+
+	using EXACT_TYPE = typename FloatingToExact<T>::type;
+
+	state stt;
+	storer::MemReader reader;
+
+	size_t out_offset = 0;
+
+	T exceptions[config::VECTOR_SIZE];
+	int64_t encoded_integers[config::VECTOR_SIZE];
+	int64_t alp_encoded_array[config::VECTOR_SIZE];
+	uint16_t exceptions_rd[config::VECTOR_SIZE];
+	uint16_t exceptions_position[config::VECTOR_SIZE];
+
+	// 'right' & 'left' refer to the respective parts of the floating-point numbers after splitting
+	uint64_t alp_bp_size;
+	uint64_t left_bp_size;
+	uint64_t right_bp_size;
+	EXACT_TYPE right_parts[config::VECTOR_SIZE];
+	EXACT_TYPE right_parts_encoded[config::VECTOR_SIZE];
+	uint16_t left_parts_encoded[config::VECTOR_SIZE];
+	uint16_t left_parts[config::VECTOR_SIZE];
+
+	EXACT_TYPE right_for_base = 0; // Always 0
+
+	AlpDecompressor() {};
+
+	void load_rd_metadata() {
+		reader.read(&stt.right_bit_width, sizeof(stt.right_bit_width));
+		reader.read(&stt.left_bit_width, sizeof(stt.left_bit_width));
+		left_bp_size = AlpApiUtils<T>::get_size_after_bitpacking(stt.left_bit_width);
+		right_bp_size = AlpApiUtils<T>::get_size_after_bitpacking(stt.right_bit_width);
+
+		reader.read(&stt.actual_dictionary_size, sizeof(stt.actual_dictionary_size));
+		uint8_t actual_dictionary_size_bytes = stt.actual_dictionary_size * DICTIONARY_ELEMENT_SIZE_BYTES;
+
+		reader.read(stt.left_parts_dict, actual_dictionary_size_bytes);
+	}
+
+	void load_alprd_vector() {
+		reader.read(&stt.exceptions_count, sizeof(stt.exceptions_count));
+		reader.read(left_parts_encoded, left_bp_size);
+		reader.read(right_parts_encoded, right_bp_size);
+		if (stt.exceptions_count) {
+			reader.read(exceptions_rd, RD_EXCEPTION_SIZE_BYTES * stt.exceptions_count);
+			reader.read(exceptions_position, RD_EXCEPTION_POSITION_SIZE_BYTES * stt.exceptions_count);
+		}
+	}
+
+	void load_alp_vector() {
+		reader.read(&stt.exp, sizeof(stt.exp));
+		reader.read(&stt.fac, sizeof(stt.fac));
+		reader.read(&stt.exceptions_count, sizeof(stt.exceptions_count));
+		reader.read(&stt.for_base, sizeof(stt.for_base));
+		reader.read(&stt.bit_width, sizeof(stt.bit_width));
+
+		if (stt.bit_width > 0) {
+			alp_bp_size = AlpApiUtils<T>::get_size_after_bitpacking(stt.bit_width);
+			reader.read(alp_encoded_array, alp_bp_size);
+		}
+
+		if (stt.exceptions_count > 0) {
+			reader.read(exceptions, Constants<T>::EXCEPTION_SIZE_BYTES * stt.exceptions_count);
+			reader.read(exceptions_position, EXCEPTION_POSITION_SIZE_BYTES * stt.exceptions_count);
+		}
+	}
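+
+	/*
+	 * [Editor's note] Layout reminder, not in the original sources: an ALP
+	 * vector is serialized by AlpCompressor::store_alp_vector() as
+	 *   exp (1 B) | fac (1 B) | exceptions_count (2 B) | for_base (8 B) |
+	 *   bit_width (1 B) | bit-packed payload | exceptions | exception positions
+	 * and load_alp_vector() above reads the fields back in the same order.
+	 */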
+
+	void decompress_vector(T* out) {
+		if (stt.scheme == SCHEME::ALP_RD) {
+			unffor::unffor(right_parts_encoded, right_parts, stt.right_bit_width, &right_for_base);
+			unffor::unffor(left_parts_encoded, left_parts, stt.left_bit_width, &stt.left_for_base);
+			AlpRD<T>::decode((out + out_offset),
+			                 right_parts,
+			                 left_parts,
+			                 exceptions_rd,
+			                 exceptions_position,
+			                 &stt.exceptions_count,
+			                 stt);
+		} else {
+			unffor::unffor(alp_encoded_array, encoded_integers, stt.bit_width, &stt.for_base);
+			AlpDecode<T>::decode(encoded_integers, stt.fac, stt.exp, (out + out_offset));
+			AlpDecode<T>::patch_exceptions((out + out_offset), exceptions, exceptions_position, &stt.exceptions_count);
+		}
+	}
+
+	void load_vector() {
+		if (stt.scheme == SCHEME::ALP_RD) {
+			load_alprd_vector();
+		} else {
+			load_alp_vector();
+		}
+	}
+
+	SCHEME load_rowgroup_metadata() {
+		uint8_t scheme_id;
+		reader.read(&scheme_id, sizeof(scheme_id));
+
+		SCHEME used_scheme = SCHEME(scheme_id);
+		if (used_scheme == SCHEME::ALP_RD) { load_rd_metadata(); }
+
+		return used_scheme;
+	}
+
+	void decompress(uint8_t* in, size_t values_count, T* out) {
+		reader = storer::MemReader(in);
+		size_t rowgroup_count = AlpApiUtils<T>::get_rowgroup_count(values_count);
+		size_t left_to_decompress = values_count;
+		for (size_t current_rowgroup = 0; current_rowgroup < rowgroup_count; current_rowgroup++) {
+			/*
+			 * Rowgroup level
+			 */
+			stt.scheme = load_rowgroup_metadata();
+
+			size_t values_left_in_rowgroup = std::min(config::ROWGROUP_SIZE, left_to_decompress);
+			size_t vectors_in_rowgroup = AlpApiUtils<T>::get_complete_vector_count(values_left_in_rowgroup);
+
+			for (size_t vector_idx = 0; vector_idx < vectors_in_rowgroup; vector_idx++) {
+				/*
+				 * Vector level
+				 */
+				size_t next_vector_count = std::min(config::VECTOR_SIZE, left_to_decompress);
+				load_vector();
+				decompress_vector(out);
+				out_offset += next_vector_count;
+				left_to_decompress -= next_vector_count;
+			}
+		}
+		if (left_to_decompress) {
+			load_vector();
+			decompress_vector(out);
+		}
+	};
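+
+	/*
+	 * [Editor's note] A minimal usage sketch, not part of the original patch,
+	 * mirroring the compressor example; `compressed` is the buffer written by
+	 * AlpCompressor<double>::compress and `values_count` the original count:
+	 *
+	 *   alp::AlpDecompressor<double> decompressor;
+	 *   std::vector<double>          decoded(values_count + alp::config::VECTOR_SIZE);
+	 *   decompressor.decompress(compressed, values_count, decoded.data());
+	 *
+	 * The extra VECTOR_SIZE of slack is an assumption to absorb the final,
+	 * possibly incomplete vector, which is decoded at full vector width.
+	 */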
+};
+
+} // namespace alp
+
+#endif
\ No newline at end of file
diff --git a/include/alp/encode.hpp b/include/alp/encode.hpp
new file mode 100644
index 0000000..a8f3c19
--- /dev/null
+++ b/include/alp/encode.hpp
@@ -0,0 +1,448 @@
+#ifndef ALP_ENCODE_HPP
+#define ALP_ENCODE_HPP
+
+#include "alp/config.hpp"
+#include "alp/constants.hpp"
+#include "alp/decode.hpp"
+#include "alp/sampler.hpp"
+#include "alp/state.hpp"
+#include "common.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <utility>
+#include <vector>
+
+#ifdef __AVX2__
+
+#include <immintrin.h>
+
+#endif
+
+/*
+ * ALP Encoding
+ */
+namespace alp {
+
+template <typename T>
+struct AlpEncode {
+
+	using EXACT_TYPE = typename FloatingToExact<T>::type;
+	static constexpr uint8_t EXACT_TYPE_BITSIZE = sizeof(EXACT_TYPE) * 8;
+
+	/*
+	 * Check for special values which are impossible for ALP to encode
+	 * because they cannot be cast to int64 without undefined behaviour
+	 */
+	static inline bool is_impossible_to_encode(const T n) {
+		return !std::isfinite(n) || std::isnan(n) || n > ENCODING_UPPER_LIMIT || n < ENCODING_LOWER_LIMIT ||
+		       (n == 0.0 && std::signbit(n)); //! Verification for -0.0
+	}
+
+	//! Scalar encoding of a single value with ALP
+	template <bool SAFE = true>
+	static int64_t encode_value(const T value, const factor_idx_t factor_idx, const exponent_idx_t exponent_idx) {
+		T tmp_encoded_value = value * Constants<T>::EXP_ARR[exponent_idx] * Constants<T>::FRAC_ARR[factor_idx];
+		if constexpr (SAFE) {
+			if (is_impossible_to_encode(tmp_encoded_value)) { return ENCODING_UPPER_LIMIT; }
+		}
+		tmp_encoded_value = tmp_encoded_value + Constants<T>::MAGIC_NUMBER - Constants<T>::MAGIC_NUMBER;
+		return static_cast<int64_t>(tmp_encoded_value);
+	}
+
+	//! Analyze FFOR to obtain the bit width and the frame-of-reference value
+	static inline void analyze_ffor(const int64_t* input_vector, bw_t& bit_width, int64_t* base_for) {
+		auto min = std::numeric_limits<int64_t>::max();
+		auto max = std::numeric_limits<int64_t>::min();
+
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			if (input_vector[i] < min) { min = input_vector[i]; }
+			if (input_vector[i] > max) { max = input_vector[i]; }
+		}
+
+		const auto delta = (static_cast<uint64_t>(max) - static_cast<uint64_t>(min));
+		const auto estimated_bits_per_value = static_cast<bw_t>(ceil(log2(delta + 1)));
+		bit_width = estimated_bits_per_value;
+		base_for[0] = min;
+	}
+
+	/*
+	 * Function to sort the best combinations from each vector sampled from the rowgroup
+	 * First criterion: the number of times the combination appears
+	 * Second criterion: the bigger exponent
+	 * Third criterion: the bigger factor
+	 */
+	static inline bool compare_best_combinations(const std::pair<std::pair<int, int>, int>& t1,
+	                                             const std::pair<std::pair<int, int>, int>& t2) {
+		return (t1.second > t2.second) || (t1.second == t2.second && (t2.first.first < t1.first.first)) ||
+		       ((t1.second == t2.second && t2.first.first == t1.first.first) && (t2.first.second < t1.first.second));
+	}
+
+	/*
+	 * Find the best combinations of factor-exponent from each vector sampled from a rowgroup
+	 * This function is called once per rowgroup
+	 * It operates over the ALP first-level samples
+	 */
+	static inline void find_top_k_combinations(const T* smp_arr, state& stt) {
+		const auto n_vectors_to_sample =
+		    static_cast<size_t>(std::ceil(static_cast<double>(stt.sampled_values_n) / config::SAMPLES_PER_VECTOR));
+		const uint64_t samples_size = std::min(stt.sampled_values_n, config::SAMPLES_PER_VECTOR);
+		std::map<std::pair<int, int>, int> global_combinations;
+		uint64_t smp_offset {0};
+
+		// For each vector in the rowgroup sample
+		size_t best_estimated_compression_size {
+		    (samples_size * (Constants<T>::EXCEPTION_SIZE + EXCEPTION_POSITION_SIZE)) +
+		    (samples_size * (Constants<T>::EXCEPTION_SIZE))};
+		for (size_t smp_n = 0; smp_n < n_vectors_to_sample; smp_n++) {
+			uint8_t found_factor {0};
+			uint8_t found_exponent {0};
+			// We start our optimization with the worst possible total bits obtained from compression
+			uint64_t sample_estimated_compression_size {
+			    (samples_size * (Constants<T>::EXCEPTION_SIZE + EXCEPTION_POSITION_SIZE)) +
+			    (samples_size * (Constants<T>::EXCEPTION_SIZE))}; // worst-case scenario
+
+			// We try all combinations in search of the one which minimizes the compression size
+			for (int8_t exp_ref = Constants<T>::MAX_EXPONENT; exp_ref >= 0; exp_ref--) {
+				for (int8_t factor_idx = exp_ref; factor_idx >= 0; factor_idx--) {
+					uint16_t exceptions_count = {0};
+					uint16_t non_exceptions_count = {0};
+					uint32_t estimated_bits_per_value = {0};
+					uint64_t estimated_compression_size = {0};
+					int64_t max_encoded_value = {std::numeric_limits<int64_t>::min()};
+					int64_t min_encoded_value = {std::numeric_limits<int64_t>::max()};
+
+					for (size_t i = 0; i < samples_size; i++) {
+						const T actual_value = smp_arr[smp_offset + i];
+						const int64_t encoded_value = encode_value(actual_value, factor_idx, exp_ref);
+						const T decoded_value = AlpDecode<T>::decode_value(encoded_value, factor_idx, exp_ref);
+						if (decoded_value == actual_value) {
+							non_exceptions_count++;
+							if (encoded_value > max_encoded_value) { max_encoded_value = encoded_value; }
+							if (encoded_value < min_encoded_value) { min_encoded_value = encoded_value; }
+						} else {
+							exceptions_count++;
+						}
+					}
+
+					// We do not take into account combinations which yield almost all exceptions
+					if (non_exceptions_count < 2) { continue; }
+
+					// Evaluate factor/exponent compression size (we optimize for FOR)
+					const uint64_t delta =
+					    (static_cast<uint64_t>(max_encoded_value) - static_cast<uint64_t>(min_encoded_value));
+					estimated_bits_per_value = std::ceil(std::log2(delta + 1));
+					estimated_compression_size += samples_size * estimated_bits_per_value;
+					estimated_compression_size +=
+					    exceptions_count * (Constants<T>::EXCEPTION_SIZE + EXCEPTION_POSITION_SIZE);
+
+					if ((estimated_compression_size < sample_estimated_compression_size) ||
+					    (estimated_compression_size == sample_estimated_compression_size &&
+					     (found_exponent < exp_ref)) ||
+					    // We prefer bigger exponents
+					    ((estimated_compression_size == sample_estimated_compression_size &&
+					      found_exponent == exp_ref) &&
+					     (found_factor < factor_idx)) // We prefer bigger factors
+					) {
+						sample_estimated_compression_size = estimated_compression_size;
+						found_exponent = exp_ref;
+						found_factor = factor_idx;
+						if (sample_estimated_compression_size < best_estimated_compression_size) {
+							best_estimated_compression_size = sample_estimated_compression_size;
+						}
+					}
+				}
+			}
+			std::pair<int, int> cmb = std::make_pair(found_exponent, found_factor);
+			global_combinations[cmb]++;
+			smp_offset += samples_size;
+		}
+
+		// We switch the scheme if we were not able to achieve compression in the current rowgroup
+		if (best_estimated_compression_size >= Constants<T>::RD_SIZE_THRESHOLD_LIMIT) {
+			stt.scheme = SCHEME::ALP_RD;
+			return;
+		}
+
+		// Convert the map to a vector of combinations in order to sort it
+		// Note that this vector is always small (< 10 combinations)
+		std::vector<std::pair<std::pair<int, int>, int>> best_k_combinations;
+		best_k_combinations.reserve(global_combinations.size());
+		for (auto const& itr : global_combinations) {
+			best_k_combinations.emplace_back(itr.first, // Pair exp, fac
+			                                 itr.second // Number of times it appeared
+			);
+		}
+		// We sort combinations based on how many times they appeared
+		std::sort(best_k_combinations.begin(), best_k_combinations.end(), compare_best_combinations);
+		if (best_k_combinations.size() < stt.k_combinations) { stt.k_combinations = best_k_combinations.size(); }
+
+		// Save the k best (exp, fac) combination pairs
+		for (size_t i {0}; i < stt.k_combinations; i++) {
+			stt.best_k_combinations.push_back(best_k_combinations[i].first);
+		}
+	}
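+
+	// [Editor's note] Worked example, not in the original sources: for a
+	// sampled vector of prices such as {12.34, 5.60, 9.99, ...}, the pair
+	// (exponent, factor) = (2, 0) makes every value an integer
+	// (e.g. 12.34 * 10^2 / 10^0 = 1234), so (2, 0) is counted once for this
+	// vector; the k most frequent pairs across all sampled vectors end up in
+	// stt.best_k_combinations.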
+
+	/*
+	 * Find the best combination of factor-exponent for a vector from within the best k combinations
+	 * This is the ALP second-level sampling
+	 */
+	static inline void
+	find_best_exponent_factor_from_combinations(const std::vector<std::pair<int, int>>& top_combinations,
+	                                            const uint8_t top_k,
+	                                            const T* input_vector,
+	                                            const uint16_t input_vector_size,
+	                                            uint8_t& factor,
+	                                            uint8_t& exponent) {
+		uint8_t found_exponent {0};
+		uint8_t found_factor {0};
+		uint64_t best_estimated_compression_size {0};
+		uint8_t worse_threshold_count {0};
+
+		const int32_t sample_increments =
+		    std::max(1, static_cast<int32_t>(std::ceil(input_vector_size / config::SAMPLES_PER_VECTOR)));
+
+		// We try each of the k combinations in search of the one which minimizes the compression size of the vector
+		for (size_t k {0}; k < top_k; k++) {
+			const int exp_idx = top_combinations[k].first;
+			const int factor_idx = top_combinations[k].second;
+			uint32_t exception_count {0};
+			uint32_t estimated_bits_per_value {0};
+			uint64_t estimated_compression_size {0};
+			int64_t max_encoded_value {std::numeric_limits<int64_t>::min()};
+			int64_t min_encoded_value {std::numeric_limits<int64_t>::max()};
+
+			for (size_t sample_idx = 0; sample_idx < input_vector_size; sample_idx += sample_increments) {
+				const T actual_value = input_vector[sample_idx];
+				const int64_t encoded_value = encode_value(actual_value, factor_idx, exp_idx);
+				const T decoded_value = AlpDecode<T>::decode_value(encoded_value, factor_idx, exp_idx);
+				if (decoded_value == actual_value) {
+					if (encoded_value > max_encoded_value) { max_encoded_value = encoded_value; }
+					if (encoded_value < min_encoded_value) { min_encoded_value = encoded_value; }
+				} else {
+					exception_count++;
+				}
+			}
+
+			// Evaluate factor/exponent performance (we optimize for FOR)
+			const uint64_t delta = max_encoded_value - min_encoded_value;
+			estimated_bits_per_value = ceil(log2(delta + 1));
+			estimated_compression_size += config::SAMPLES_PER_VECTOR * estimated_bits_per_value;
+			estimated_compression_size += exception_count * (Constants<T>::EXCEPTION_SIZE + EXCEPTION_POSITION_SIZE);
+
+			if (k == 0) { // First try with the first combination
+				best_estimated_compression_size = estimated_compression_size;
+				found_factor = factor_idx;
+				found_exponent = exp_idx;
+				continue; // Go to the second one
+			}
+			if (estimated_compression_size >=
+			    best_estimated_compression_size) { // If the current one is worse than or equal to the previous one
+				worse_threshold_count += 1;
+				if (worse_threshold_count == SAMPLING_EARLY_EXIT_THRESHOLD) {
+					break; // We stop only if two are worse
+				}
+				continue;
+			}
+			// Otherwise we replace the best one and continue with the next one
+			best_estimated_compression_size = estimated_compression_size;
+			found_factor = factor_idx;
+			found_exponent = exp_idx;
+			worse_threshold_count = 0;
+		}
+		exponent = found_exponent;
+		factor = found_factor;
+	}
+
+	// DOUBLE
+	static inline void encode_simdized(const double* input_vector,
+	                                   double* exceptions,
+	                                   exp_p_t* exceptions_positions,
+	                                   exp_c_t* exceptions_count,
+	                                   int64_t* encoded_integers,
+	                                   const factor_idx_t factor_idx,
+	                                   const exponent_idx_t exponent_idx) {
+		alignas(64) static double encoded_dbl_arr[1024];
+		alignas(64) static double dbl_arr_without_specials[1024];
+		alignas(64) static uint64_t INDEX_ARR[1024];
+
+		exp_p_t current_exceptions_count {0};
+		uint64_t exceptions_idx {0};
+
+		// make a copy of the input with all special values replaced by ENCODING_UPPER_LIMIT
+		const auto* tmp_input = reinterpret_cast<const uint64_t*>(input_vector);
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			const auto is_special =
+			    ((tmp_input[i] & 0x7FFFFFFFFFFFFFFF) >=
+			     0x7FF0000000000000) // any NaN, +inf and -inf (https://stackoverflow.com/questions/29730530/)
+			    || tmp_input[i] == Constants<double>::NEGATIVE_ZERO;
+
+			if (is_special) {
+				dbl_arr_without_specials[i] = ENCODING_UPPER_LIMIT;
+			} else {
+				dbl_arr_without_specials[i] = input_vector[i];
+			}
+		}
+
+#pragma clang loop vectorize_width(64)
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			auto const actual_value = dbl_arr_without_specials[i];
+
+			// Attempt conversion
+			const int64_t encoded_value = encode_value(actual_value, factor_idx, exponent_idx);
+			encoded_integers[i] = encoded_value;
+			const double decoded_value = AlpDecode<double>::decode_value(encoded_value, factor_idx, exponent_idx);
+			encoded_dbl_arr[i] = decoded_value;
+		}
+
+#ifdef __AVX512F__
+		// [Editor's note] The identifiers in this block were inconsistent in the
+		// original patch (tmp_dbl_arr / tmp_index were undeclared); they are aligned
+		// here with the arrays declared above, and the index vector is loaded from
+		// the iota table alp::INDEX_ARR defined in constants.hpp.
+		for (size_t i {0}; i < config::VECTOR_SIZE; i = i + 8) {
+			__m512d l = _mm512_loadu_pd(encoded_dbl_arr + i);
+			__m512d r = _mm512_loadu_pd(dbl_arr_without_specials + i);
+			__m512i index = _mm512_loadu_si512(alp::INDEX_ARR + i);
+			auto is_exception = _mm512_cmpneq_pd_mask(l, r);
+			_mm512_mask_compressstoreu_epi64(INDEX_ARR + exceptions_idx, is_exception, index);
+			exceptions_idx += LOOKUP_TABLE[is_exception]; // LOOKUP_TABLE[mask] == popcount(mask)
+		}
+#else
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			auto l = encoded_dbl_arr[i];
+			auto r = dbl_arr_without_specials[i];
+			auto is_exception = (l != r);
+			INDEX_ARR[exceptions_idx] = i;
+			exceptions_idx += is_exception;
+		}
+#endif
+
+		int64_t a_non_exception_value = 0;
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			if (i != INDEX_ARR[i]) {
+				a_non_exception_value = encoded_integers[i];
+				break;
+			}
+		}
+
+		for (size_t j {0}; j < exceptions_idx; j++) {
+			size_t i = INDEX_ARR[j];
+			const auto actual_value = input_vector[i];
+			encoded_integers[i] = a_non_exception_value;
+			exceptions[current_exceptions_count] = actual_value;
+			exceptions_positions[current_exceptions_count] = i;
+			current_exceptions_count = current_exceptions_count + 1;
+		}
+
+		*exceptions_count = current_exceptions_count;
+	}
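+
+	// [Editor's note] Worked example, not in the original sources: if the
+	// comparison mask for one block of 8 doubles is 0b00100110, lanes 1, 2 and 5
+	// differ after the round trip; the compressed store writes {i+1, i+2, i+5}
+	// contiguously into INDEX_ARR, and LOOKUP_TABLE[0b00100110] == 3 advances
+	// exceptions_idx by the number of exceptions found.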
+
+	// FLOAT
+	static inline void encode_simdized(const float* input_vector,
+	                                   float* exceptions,
+	                                   exp_p_t* exceptions_positions,
+	                                   exp_c_t* exceptions_count,
+	                                   int64_t* encoded_integers,
+	                                   const factor_idx_t factor_idx,
+	                                   const exponent_idx_t exponent_idx) {
+		alignas(64) static float encoded_dbl_arr[1024];
+		alignas(64) static float dbl_arr_without_specials[1024];
+		alignas(64) static uint64_t INDEX_ARR[1024];
+
+		exp_p_t current_exceptions_count {0};
+		uint64_t exceptions_idx {0};
+
+		// make a copy of the input with all special values replaced by ENCODING_UPPER_LIMIT
+		const auto* tmp_input = reinterpret_cast<const uint32_t*>(input_vector);
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			const auto is_special =
+			    ((tmp_input[i] & 0x7FFFFFFF) >=
+			     0x7F800000) // any NaN, +inf and -inf (https://stackoverflow.com/questions/29730530/)
+			    || tmp_input[i] == Constants<float>::NEGATIVE_ZERO;
+
+			if (is_special) {
+				dbl_arr_without_specials[i] = ENCODING_UPPER_LIMIT;
+			} else {
+				dbl_arr_without_specials[i] = input_vector[i];
+			}
+		}
+
+#pragma clang loop vectorize_width(64)
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			auto const actual_value = dbl_arr_without_specials[i];
+
+			// Attempt conversion
+			const int64_t encoded_value = encode_value(actual_value, factor_idx, exponent_idx);
+			encoded_integers[i] = encoded_value;
+			const float decoded_value = AlpDecode<float>::decode_value(encoded_value, factor_idx, exponent_idx);
+			encoded_dbl_arr[i] = decoded_value;
+		}
+
+#ifdef __AVX512F__
+		// [Editor's note] As in the double overload, the original block used
+		// undeclared identifiers and a float compressed store on 64-bit indices;
+		// it is rewritten here to record the mismatching lanes of each 16-wide
+		// comparison mask directly, which matches the scalar fallback below.
+		for (size_t i {0}; i < config::VECTOR_SIZE; i = i + 16) {
+			__m512 l = _mm512_loadu_ps(encoded_dbl_arr + i);
+			__m512 r = _mm512_loadu_ps(dbl_arr_without_specials + i);
+			const uint32_t is_exception = _mm512_cmpneq_ps_mask(l, r);
+			for (size_t lane {0}; lane < 16; lane++) {
+				INDEX_ARR[exceptions_idx] = i + lane;
+				exceptions_idx += (is_exception >> lane) & 1;
+			}
+		}
+#else
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			auto l = encoded_dbl_arr[i];
+			auto r = dbl_arr_without_specials[i];
+			auto is_exception = (l != r);
+			INDEX_ARR[exceptions_idx] = i;
+			exceptions_idx += is_exception;
+		}
+#endif
+
+		int64_t a_non_exception_value = 0;
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			if (i != INDEX_ARR[i]) {
+				a_non_exception_value = encoded_integers[i];
+				break;
+			}
+		}
+
+		for (size_t j {0}; j < exceptions_idx; j++) {
+			size_t i = INDEX_ARR[j];
+			const auto actual_value = input_vector[i];
+			encoded_integers[i] = a_non_exception_value;
+			exceptions[current_exceptions_count] = actual_value;
+			exceptions_positions[current_exceptions_count] = i;
+			current_exceptions_count = current_exceptions_count + 1;
+		}
+
+		*exceptions_count = current_exceptions_count;
+	}
+
+	static inline void encode(const T* input_vector,
+	                          T* exceptions,
+	                          uint16_t* exceptions_positions,
+	                          uint16_t* exceptions_count,
+	                          int64_t* encoded_integers,
+	                          state& stt) {
+
+		if (stt.k_combinations > 1) { // We sample and search only if more than one top combination was found
+			find_best_exponent_factor_from_combinations(
+			    stt.best_k_combinations, stt.k_combinations, input_vector, stt.vector_size, stt.fac, stt.exp);
+		} else {
+			stt.exp = stt.best_k_combinations[0].first;
+			stt.fac = stt.best_k_combinations[0].second;
+		}
+		encode_simdized(
+		    input_vector, exceptions, exceptions_positions, exceptions_count, encoded_integers, stt.fac, stt.exp);
+	}
+
+	static inline void
+	init(const T* data_column, const size_t column_offset, const size_t tuples_count, T* sample_arr, state& stt) {
+		stt.scheme = SCHEME::ALP;
+		stt.sampled_values_n = sampler::first_level_sample(data_column, column_offset, tuples_count, sample_arr);
+		stt.k_combinations = config::MAX_K_COMBINATIONS;
+		stt.best_k_combinations.clear();
+		find_top_k_combinations(sample_arr, stt);
+	}
+};
+
+} // namespace alp
+#endif
diff --git a/include/alp/falp.hpp b/include/alp/falp.hpp
new file mode 100644
index 0000000..872f522
--- /dev/null
+++ b/include/alp/falp.hpp
@@ -0,0 +1,111 @@
+/*
+-- DATE : 18/04/2024
+-- FILE_PATH : include/alp/falp.hpp
+-- PROJECT_NAME : ALP
+*/
+
+#ifndef ALP_FALP_HPP
+#define ALP_FALP_HPP
+
+#include <cstdint>
+
+namespace generated { namespace falp {
+namespace fallback {
+namespace scalar {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+
+} // namespace scalar
+namespace unit64 {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+
+} // namespace unit64
+} // namespace fallback
+
+namespace helper { namespace scalar {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+}} // namespace helper::scalar
+
+namespace x86_64 {
+namespace sse {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+} // namespace sse
+
+namespace avx2 {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+} // namespace avx2
+
+namespace avx512f {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+}
+
+namespace avx512bw {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+} // namespace avx512bw
+
+} // namespace x86_64
+namespace wasm { namespace simd128 {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+}} // namespace wasm::simd128
+
+namespace arm64v8 {
+namespace neon {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+} // namespace neon
+
+namespace sve {
+void falp(const uint64_t* __restrict in,
+          double* __restrict out,
+          uint8_t bw,
+          const uint64_t* __restrict a_base_p,
+          uint8_t factor,
+          uint8_t exponent);
+} // namespace sve
+} // namespace arm64v8
+}} // namespace generated::falp
+
+#endif // FALP_HPP
diff --git a/include/alp/rd.hpp b/include/alp/rd.hpp
new file mode 100644
index 0000000..a887ea3
--- /dev/null
+++ b/include/alp/rd.hpp
@@ -0,0 +1,183 @@
+#ifndef ALP_CUTTER_HPP
+#define ALP_CUTTER_HPP
+
+#include "alp/common.hpp"
+#include "alp/constants.hpp"
+#include "alp/encode.hpp"
+#include "alp/sampler.hpp"
+#include <unordered_map>
+
+namespace alp {
+
+template <typename T>
+struct AlpRD {
+
+	using EXACT_TYPE = typename FloatingToExact<T>::type;
+	static constexpr uint8_t EXACT_TYPE_BITSIZE = sizeof(EXACT_TYPE) * 8;
+
+	//! Estimate the bits per value of ALPRD within a sample
+	static inline double estimate_compression_size(const bw_t right_bit_width,
+	                                               const bw_t left_bit_width,
+	                                               const exp_c_t exceptions_count,
+	                                               const uint64_t sample_count) {
+		const double exceptions_size = exceptions_count * (RD_EXCEPTION_POSITION_SIZE + RD_EXCEPTION_SIZE);
+		const double estimated_size = right_bit_width + left_bit_width + (exceptions_size / sample_count);
+		return estimated_size;
+	}
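+
+	// [Editor's note] Worked example, not in the original sources: with
+	// right_bit_width = 48, left_bit_width = 3 and 5 exceptions in a 32-value
+	// sample, the estimate is 48 + 3 + 5 * (16 + 16) / 32 = 56 bits per value.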
+
+	template <bool PERSIST_DICT>
+	static double build_left_parts_dictionary(const T* in_p, bw_t right_bit_width, state& stt) {
+		std::unordered_map<EXACT_TYPE, int32_t> left_parts_hash;
+		std::vector<std::pair<int32_t, EXACT_TYPE>> left_parts_sorted_repetitions;
+
+		auto* in = reinterpret_cast<const EXACT_TYPE*>(in_p);
+		// Build a hash of all the left parts and how many times they appear
+		for (size_t i = 0; i < stt.sampled_values_n; i++) {
+			auto left_tmp = in[i] >> right_bit_width;
+			left_parts_hash[left_tmp]++;
+		}
+
+		// We build a vector from the hash to be able to sort it by repetition count
+		left_parts_sorted_repetitions.reserve(left_parts_hash.size());
+		for (auto& pair : left_parts_hash) {
+			left_parts_sorted_repetitions.emplace_back(pair.second, pair.first);
+		}
+		std::sort(left_parts_sorted_repetitions.begin(),
+		          left_parts_sorted_repetitions.end(),
+		          [](const std::pair<int32_t, EXACT_TYPE>& a, const std::pair<int32_t, EXACT_TYPE>& b) {
+			          return a.first > b.first;
+		          });
+
+		// Exceptions are left parts which do not fit in the fixed dictionary size
+		uint32_t exceptions_count {0};
+		for (size_t i {config::MAX_RD_DICTIONARY_SIZE}; i < left_parts_sorted_repetitions.size(); i++) {
+			exceptions_count += left_parts_sorted_repetitions[i].first;
+		}
+
+		// The left-parts bit width after compression is determined by how many elements are in the dictionary
+		uint8_t actual_dictionary_size =
+		    std::min(config::MAX_RD_DICTIONARY_SIZE, left_parts_sorted_repetitions.size());
+		bw_t left_bit_width = std::max<bw_t>(1, std::ceil(std::log2(actual_dictionary_size)));
+
+		if (PERSIST_DICT) {
+			stt.left_parts_dict_map.clear();
+			for (size_t dict_idx = 0; dict_idx < actual_dictionary_size; dict_idx++) {
+				//! The left_parts_dict keys are mapped to the left parts themselves
+				stt.left_parts_dict[dict_idx] = left_parts_sorted_repetitions[dict_idx].second; // .hash
+				stt.left_parts_dict_map.insert({stt.left_parts_dict[dict_idx], dict_idx});
+			}
+			//! In parallel we store a map of the dictionary to quickly resolve exceptions during encoding
+			for (size_t i = actual_dictionary_size + 1; i < left_parts_sorted_repetitions.size(); i++) {
+				stt.left_parts_dict_map.insert({left_parts_sorted_repetitions[i].second, i}); // .hash
+			}
+			stt.left_bit_width = left_bit_width;
+			stt.right_bit_width = right_bit_width;
+			stt.actual_dictionary_size = actual_dictionary_size;
+			stt.actual_dictionary_size_bytes = actual_dictionary_size * DICTIONARY_ELEMENT_SIZE_BYTES;
+		}
+
+		double estimated_size =
+		    estimate_compression_size(right_bit_width, left_bit_width, exceptions_count, stt.sampled_values_n);
+		return estimated_size;
+	}
+
+	static inline void find_best_dictionary(T* smp_arr, state& stt) {
+		bw_t right_bit_width {0};
+		double best_dict_size = std::numeric_limits<double>::max();
+
+		// Finding the best position to cut the values
+		for (size_t i {1}; i <= config::CUTTING_LIMIT; i++) {
+			bw_t candidate_right_bit_width = EXACT_TYPE_BITSIZE - i;
+			const double estimated_size = build_left_parts_dictionary<false>(smp_arr, candidate_right_bit_width, stt);
+			if (estimated_size < best_dict_size) {
+				right_bit_width = candidate_right_bit_width;
+				best_dict_size = estimated_size;
+			}
+			// TODO: We can implement an early-exit mechanism similar to normal ALP
+		}
+		build_left_parts_dictionary<true>(smp_arr, right_bit_width, stt);
+	}
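+
+	/*
+	 * [Editor's note] Worked example, not in the original sources: cutting a
+	 * double with right_bit_width = 49 splits its 64 bits into a 15-bit left
+	 * part (sign, exponent and leading mantissa bits, which repeat heavily in
+	 * real data and therefore dictionary-encode well) and a 49-bit right part
+	 * that is bit-packed verbatim.
+	 */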
+
+	/*
+	 * ALP RD Encode
+	 */
+	static inline void encode(const T* dbl_arr,
+	                          uint16_t* exceptions,
+	                          uint16_t* exception_positions,
+	                          uint16_t* exceptions_count_p,
+	                          EXACT_TYPE* right_parts,
+	                          uint16_t* left_parts,
+	                          state& stt) {
+		const EXACT_TYPE* in = reinterpret_cast<const EXACT_TYPE*>(dbl_arr);
+
+		// Cutting the floating-point values
+		for (size_t i {0}; i < config::VECTOR_SIZE; ++i) {
+			EXACT_TYPE tmp = in[i];
+			right_parts[i] = tmp & ((1ULL << stt.right_bit_width) - 1);
+			left_parts[i] = (tmp >> stt.right_bit_width);
+		}
+
+		uint16_t exceptions_count {0};
+		// Dictionary encoding for left parts
+		for (size_t i {0}; i < config::VECTOR_SIZE; i++) {
+			uint16_t dictionary_index;
+			auto dictionary_key = left_parts[i];
+			if (stt.left_parts_dict_map.find(dictionary_key) == stt.left_parts_dict_map.end()) {
+				// If not found in the dictionary, we store the smallest non-key index as an exception (the dict size)
+				dictionary_index = stt.actual_dictionary_size;
+			} else {
+				dictionary_index = stt.left_parts_dict_map[dictionary_key];
+			}
+			left_parts[i] = dictionary_index;
+
+			//! Left parts not found in the dictionary are stored as exceptions
+			if (dictionary_index >= stt.actual_dictionary_size) {
+				exceptions[exceptions_count] = dictionary_key;
+				exception_positions[exceptions_count] = i;
+				exceptions_count++;
+			}
+		}
+		stt.exceptions_count = exceptions_count;
+		exceptions_count_p[0] = exceptions_count;
+	}
+
+	/*
+	 * ALP RD Decode
+	 */
+	static inline void decode(T* a_out,
+	                          EXACT_TYPE* unffor_right_arr,
+	                          uint16_t* unffor_left_arr,
+	                          uint16_t* exceptions,
+	                          uint16_t* exceptions_positions,
+	                          uint16_t* exceptions_count,
+	                          state& stt) {
+
+		EXACT_TYPE* out = reinterpret_cast<EXACT_TYPE*>(a_out);
+		auto* right_parts = unffor_right_arr;
+		auto* left_parts = unffor_left_arr;
+
+		// Decoding
+		for (size_t i = 0; i < config::VECTOR_SIZE; i++) {
+			uint16_t left = stt.left_parts_dict[left_parts[i]];
+			EXACT_TYPE right = right_parts[i];
+			out[i] = (static_cast<EXACT_TYPE>(left) << stt.right_bit_width) | right;
+		}
+
+		// Exceptions patching (exceptions only occur in left parts)
+		auto exp_c = exceptions_count[0];
+		for (size_t i = 0; i < exp_c; i++) {
+			EXACT_TYPE right = right_parts[exceptions_positions[i]];
+			uint16_t left = exceptions[i];
+			out[exceptions_positions[i]] = (static_cast<EXACT_TYPE>(left) << stt.right_bit_width) | right;
+		}
+	}
+
+	static inline void init(T* data_column, size_t column_offset, size_t tuples_count, T* sample_arr, state& stt) {
+		stt.scheme = SCHEME::ALP_RD;
+		stt.sampled_values_n = sampler::first_level_sample(data_column, column_offset, tuples_count, sample_arr);
+		find_best_dictionary(sample_arr, stt);
+	}
+};
+
+} // namespace alp
+#endif // ALP_CUTTER_HPP
diff --git a/include/alp/sampler.hpp b/include/alp/sampler.hpp
new file mode 100644
index 0000000..88d2d60
--- /dev/null
+++ b/include/alp/sampler.hpp
@@ -0,0 +1,52 @@
+#ifndef ALP_SAMPLER_HPP
+#define ALP_SAMPLER_HPP
+
+#include "alp/config.hpp"
+#include <algorithm>
+#include <cmath>
+
+namespace alp::sampler {
+
+template <typename T>
+inline size_t first_level_sample(const T* data, const size_t data_offset, const size_t data_size, T* data_sample) {
+	const size_t left_in_data = data_size - data_offset;
+	const size_t portion_to_sample = std::min(config::ROWGROUP_SIZE, left_in_data);
+	const size_t available_alp_vectors = std::ceil(static_cast<double>(portion_to_sample) / config::VECTOR_SIZE);
+	size_t sample_idx = 0;
+	size_t data_idx = data_offset;
+
+	for (size_t vector_idx = 0; vector_idx < available_alp_vectors; vector_idx++) {
+		const size_t current_vector_n_values = std::min(data_size - data_idx, config::VECTOR_SIZE);
+
+		//! We sample equidistant vectors; to do this we skip a fixed number of vectors
+		//! If we are not on the correct jump, we do not take a sample from this vector
+		if (const bool must_select_rowgroup_sample = (vector_idx % config::ROWGROUP_SAMPLES_JUMP) == 0;
+		    !must_select_rowgroup_sample) {
+			data_idx += current_vector_n_values;
+			continue;
+		}
+
+		const size_t n_sampled_increments = std::max<size_t>(
+		    1,
+		    static_cast<size_t>(std::ceil(static_cast<double>(current_vector_n_values) / config::SAMPLES_PER_VECTOR)));
+
+		//! We do not take samples of non-complete DuckDB vectors (usually the last one)
+		//! Except in the case of too little data
+		if (current_vector_n_values < config::SAMPLES_PER_VECTOR && sample_idx != 0) {
+			data_idx += current_vector_n_values;
+			continue;
+		}
+
+		// Storing the sample of that vector
+		for (size_t i = 0; i < current_vector_n_values; i += n_sampled_increments) {
+			data_sample[sample_idx] = data[data_idx + i];
+			sample_idx++;
+		}
+		data_idx += current_vector_n_values;
+	}
+	return sample_idx;
+}
+
+} // namespace alp::sampler
+
+#endif
diff --git a/include/alp/state.hpp b/include/alp/state.hpp
new file mode 100644
index 0000000..e7fd8b8
--- /dev/null
+++ b/include/alp/state.hpp
@@ -0,0 +1,37 @@
+#ifndef ALP_STATE_HPP
+#define ALP_STATE_HPP
+
+#include "alp/common.hpp"
+#include "alp/config.hpp"
+#include "alp/constants.hpp"
+#include <unordered_map>
+#include <vector>
+
+namespace alp {
+struct state {
+	SCHEME scheme {SCHEME::ALP};
+	uint16_t vector_size {config::VECTOR_SIZE};
+	uint16_t exceptions_count {0};
+	size_t sampled_values_n {0};
+
+	// ALP
+	uint16_t k_combinations {5};
+	std::vector<std::pair<int, int>> best_k_combinations;
+	uint8_t exp;
+	uint8_t fac;
+	bw_t bit_width;
+	int64_t for_base;
+
+	// ALP RD
+	bw_t right_bit_width {0};
+	bw_t left_bit_width {0};
+	uint64_t right_for_base {0}; // Always 0
+	uint16_t left_for_base {0};  // Always 0
+	uint16_t left_parts_dict[config::MAX_RD_DICTIONARY_SIZE];
+	uint8_t actual_dictionary_size;
+	uint32_t actual_dictionary_size_bytes;
+	std::unordered_map<uint64_t, uint16_t> left_parts_dict_map;
+};
+} // namespace alp
+
+#endif
\ No newline at end of file
diff --git a/include/alp/storer.hpp b/include/alp/storer.hpp
new file mode 100644
index 0000000..bc761fb
--- /dev/null
+++ b/include/alp/storer.hpp
@@ -0,0 +1,56 @@
+#ifndef ALP_API_MEM_STORER_HPP
+#define ALP_API_MEM_STORER_HPP
+
+#include <cstdint>
+#include <cstring>
+
+namespace alp { namespace storer {
+
+template <bool DRY = false>
+struct MemStorer {
+
+	uint8_t* out_buffer;
+	size_t buffer_offset;
+
+	MemStorer() {}
+	MemStorer(uint8_t* out_buffer)
+	    : out_buffer(out_buffer)
+	    , buffer_offset(0) {}
+
+	void set_buffer(uint8_t* out) { out_buffer = out; }
+
+	void reset() { buffer_offset = 0; }
+
+	size_t get_size() { return buffer_offset; }
+
+	void store(void* in, size_t bytes_to_store) {
+		if (!DRY) memcpy((void*)(out_buffer + buffer_offset), in, bytes_to_store);
+		buffer_offset += bytes_to_store;
+	}
+};
+
+struct MemReader {
+
+	uint8_t* in_buffer;
+	size_t buffer_offset;
+
+	MemReader() {}
+	MemReader(uint8_t* in_buffer)
+	    : in_buffer(in_buffer)
+	    , buffer_offset(0) {}
+
+	void set_buffer(uint8_t* in) { in_buffer = in; }
+
+	void reset() { buffer_offset = 0; }
+
+	size_t get_size() { return buffer_offset; }
+
+	void read(void* out, size_t bytes_to_read) {
+		memcpy(out, (void*)(in_buffer + buffer_offset), bytes_to_read);
+		buffer_offset += bytes_to_read;
+	}
+};
+
+}} // namespace alp::storer
+
+#endif
\ No newline at end of file
diff --git a/include/alp/utils.hpp b/include/alp/utils.hpp
new file mode 100644
index 0000000..13fa75c
--- /dev/null
+++ b/include/alp/utils.hpp
@@ -0,0 +1,74 @@
+#ifndef ALP_UTILS_HPP
+#define ALP_UTILS_HPP
+
+#include "alp/config.hpp"
+#include "alp/encode.hpp"
+#include <cmath>
+#include <new>
+
+namespace alp {
+
+template <typename T>
+struct AlpApiUtils {
+
+	static size_t get_rowgroup_count(size_t values_count) {
+		return std::ceil((double)values_count / config::ROWGROUP_SIZE);
+	};
+
+	static size_t get_complete_vector_count(size_t n_values) {
+		return std::floor(static_cast<double>(n_values) / config::VECTOR_SIZE);
+	}
+
+	/*
+	 * Function to get the size of a vector after bit packing
+	 * Note that we always store VECTOR_SIZE-sized vectors
+	 */
+	static size_t get_size_after_bitpacking(uint8_t bit_width) {
+		return align_value(config::VECTOR_SIZE * bit_width) / 8;
+	}
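+
+	// [Editor's note] Worked example, not in the original sources: for
+	// bit_width = 3, a 1024-value vector packs into 1024 * 3 = 3072 bits,
+	// already a multiple of 8, so get_size_after_bitpacking(3) == 384 bytes.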
+
+	template <typename M, M val = 8>
+	static M align_value(M n) {
+		return ((n + (val - 1)) / val) * val;
+	}
+
+	static void fill_incomplete_alp_vector(T* input_vector,
+	                                       T* exceptions,
+	                                       uint16_t* exceptions_positions,
+	                                       uint16_t* exceptions_count,
+	                                       int64_t* encoded_integers,
+	                                       state& stt) {
+
+		// We fill the vector with 0s since these values will never be exceptions
+		for (size_t i = stt.vector_size; i < config::VECTOR_SIZE; i++) {
+			input_vector[i] = 0.0;
+		}
+		// We encode the vector filled with the dummy values
+		AlpEncode<T>::encode(input_vector, exceptions, exceptions_positions, exceptions_count, encoded_integers, stt);
+		T a_non_exception_value = 0.0;
+		// We look up the first non-exception value among the true vector values.
+		// [Editor's note] The original scanned a freshly allocated, never-written
+		// scratch buffer (`tmp_index`), which is undefined behaviour; the exception
+		// positions reported by encode() are scanned here instead.
+		size_t exc_idx = 0;
+		for (size_t i {0}; i < stt.vector_size; i++) {
+			if (exc_idx < *exceptions_count && exceptions_positions[exc_idx] == i) {
+				exc_idx++;
+				continue;
+			}
+			a_non_exception_value = input_vector[i];
+			break;
+		}
+		// We fill the vector with this dummy value
+		for (size_t i = stt.vector_size; i < config::VECTOR_SIZE; i++) {
+			input_vector[i] = a_non_exception_value;
+		}
+	}
+
+	static void fill_incomplete_alprd_vector(T* input_vector, const state& stt) {
+		// We just fill the vector with the first value
+		const T first_vector_value = input_vector[0];
+		for (size_t i = stt.vector_size; i < config::VECTOR_SIZE; i++) {
+			input_vector[i] = first_vector_value;
+		}
+	}
+};
+} // namespace alp
+
+#endif
\ No newline at end of file
diff --git a/include/fastlanes/ffor.hpp b/include/fastlanes/ffor.hpp
new file mode 100644
index 0000000..5d90501
--- /dev/null
+++ b/include/fastlanes/ffor.hpp
@@ -0,0 +1,21 @@
+#ifndef FASTLANES_FFOR_HPP
+#define FASTLANES_FFOR_HPP
+
+#include <cstdint>
+
+namespace fastlanes::generated::ffor::fallback::scalar {
+void ffor(const uint64_t* __restrict in, uint64_t* __restrict out, uint8_t bw, const uint64_t* __restrict a_base_p);
+void ffor(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw, const uint32_t* __restrict a_base_p);
+void ffor(const uint16_t* __restrict in, uint16_t* __restrict out, uint8_t bw, const uint16_t* __restrict a_base_p);
+void ffor(const uint8_t* __restrict in, uint8_t* __restrict out, uint8_t bw, const uint8_t* __restrict a_base_p);
+
+void ffor(const int64_t* __restrict in, int64_t* __restrict out, uint8_t bw, const int64_t* __restrict a_base_p);
+void ffor(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw, const int32_t* __restrict a_base_p);
+void ffor(const int16_t* __restrict in, int16_t* __restrict out, uint8_t bw, const int16_t* __restrict a_base_p);
+void ffor(const int8_t* __restrict in, int8_t* __restrict out, uint8_t bw, const int8_t* __restrict a_base_p);
+
+} // namespace fastlanes::generated::ffor::fallback::scalar
+
+namespace ffor = fastlanes::generated::ffor::fallback::scalar;
+
+#endif
diff --git a/include/fastlanes/macros.hpp b/include/fastlanes/macros.hpp
new file mode 100644
index 0000000..f2ff89b
--- /dev/null
+++ b/include/fastlanes/macros.hpp
@@ -0,0 +1,17 @@
+#ifndef ALP_MACROS_H
+#define ALP_MACROS_H
+
+#define _mm256_set1_epi64 _mm256_set1_epi64x
+
+#define _mm128_loadu_si128 _mm_loadu_si128
+#define _mm128_storeu_si128 _mm_storeu_si128
+#define _mm128_and_si128 _mm_and_si128
+#define _mm128_or_si128 _mm_or_si128
+#define _mm128_srli_epi64 _mm_srli_epi64
+#define _mm128_slli_epi64 _mm_slli_epi64
+#define _mm128_set1_epi8 _mm_set1_epi8
+#define _mm128_set1_epi16 _mm_set1_epi16
+#define _mm128_set1_epi32 _mm_set1_epi32
+#define _mm128_set1_epi64 _mm_set1_epi64x
+
+#endif // ALP_MACROS_H
diff --git a/include/fastlanes/unffor.hpp b/include/fastlanes/unffor.hpp
new file mode 100644
index 0000000..0690930
--- /dev/null
+++ b/include/fastlanes/unffor.hpp
@@ -0,0 +1,20 @@
+#ifndef FASTLANES_UNFFOR_HPP
+#define FASTLANES_UNFFOR_HPP
+
+#include <cstdint>
+
+namespace fastlanes { namespace generated { namespace unffor { namespace fallback { namespace scalar {
+void unffor(const uint64_t* __restrict in, uint64_t* __restrict out, uint8_t bw, const uint64_t* __restrict a_base_p);
+void unffor(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw, const uint32_t* __restrict a_base_p);
+void unffor(const uint16_t* __restrict in, uint16_t* __restrict out, uint8_t bw, const uint16_t* __restrict a_base_p);
+void unffor(const uint8_t* __restrict in, uint8_t* __restrict out, uint8_t bw, const uint8_t* __restrict a_base_p);
+
+void unffor(const int64_t* __restrict in, int64_t* __restrict out, uint8_t bw, const int64_t* __restrict a_base_p);
+void unffor(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw, const int32_t* __restrict a_base_p);
+void unffor(const int16_t* __restrict in, int16_t* __restrict out, uint8_t bw, const int16_t* __restrict a_base_p);
+void unffor(const int8_t* __restrict in, int8_t* __restrict out, uint8_t bw, const int8_t* __restrict a_base_p);
+}}}}} // namespace fastlanes::generated::unffor::fallback::scalar
+
+namespace unffor = fastlanes::generated::unffor::fallback::scalar;
+
+#endif
diff --git a/publication/alp_compression_ratio.csv b/publication/alp_compression_ratio.csv
new file mode 100644
index 0000000..8773516
--- /dev/null
+++ b/publication/alp_compression_ratio.csv
@@ -0,0 +1,29 @@
+dataset,size,rowgroups_count,vectors_count
+Air-Pressure,16.43,1345,134493
+Arade/4,24.94,97,9657
+Basel-Temp,30.72,2,120
+Basel-Wind,29.81,2,120
+Bird-Mig,20.14,1,17
+Btc-Price,26.37,1,2
+Blockchain,36.49,3,225
+City-Temp,10.74,29,2837
+CMS/1,35.65,182,18140
+CMS/9,11.67,182,18140
+CMS/25,41.11,182,18140
+Dew-Temp,13.40,53,5287
+Bio-Temp,10.75,3719,371892
+Food-prices,23.65,21,2002
+Gov/10,30.99,1379,137816
+Gov/26,0.41,1379,137816
+Gov/30,7.48,1379,137816
+Gov/31,3.05,1379,137816
+Gov/40,0.83,1379,137816
+Medicare/1,39.35,91,9070
+Medicare/9,12.26,91,9070
+PM10-dust,8.56,3,216
+NYC/29,40.38,171,17037
+SD-bench,16.21,1,8
+Stocks-DE,11.01,426,42544
+Stocks-UK,12.59,580,57915
+Stocks-USA,7.90,2755,275465
+Wind-dir,15.89,1943,194237
diff --git a/publication/alp_rd32_compression_ratio.csv b/publication/alp_rd32_compression_ratio.csv
new file mode 100644
index 0000000..a4fdfaf
--- /dev/null
+++ b/publication/alp_rd32_compression_ratio.csv
@@ -0,0 +1,5 @@
+dataset,size,rowgroups_count,vectors_count
+Dino-Vitb16,28.78,844,84365
+GPT2,28.01,1216,121524
+Grammarly-lg,29.16,7648,764740
+WAV2VEC,28.01,922,92184
diff --git a/publication/alp_rd_compression_ratio.csv b/publication/alp_rd_compression_ratio.csv
new file mode 100644
index 0000000..62e8bb4
--- /dev/null
+++ b/publication/alp_rd_compression_ratio.csv
@@ -0,0 +1,3 @@
+dataset,size,rowgroups_count,vectors_count
+POI-lat,55.74,5,415
+POI-lon,56.56,5,415
diff --git a/publication/alp_results.png b/publication/alp_results.png
new file mode 100644
index 0000000000000000000000000000000000000000..6f85a556c64048fa3022feea7499f75283ce798a
GIT binary patch
literal 224138
[... base85-encoded binary payload of publication/alp_results.png omitted ...]
zHM5#mwyTBt2`25fYJ>irHR$CoX} z4Cr^wW7clG=Y;KO6zbJ>Jd@7XIeT$FE3s)aX+^GOSJs8atPw7>t%cnZMcb|(`N+uIcK%ynq)ru})`f+quM<#UHO04)zs`LC&n#9sEXe1T z$myUALJR3Njd%(rt$eC@ ze*}O7d)7Bpf@a7hX8G~a?szqWZ^z@y_AfJ_a5lt*H* zKfr?_?yU)SOiM~CA|KdBqL_mx6rK#Z!#;*_ykSMFs^mL?S0RhuJ`9N4B~N8Hz0$Pu z1TsB8fEvMVpt24kVbJg_!WRg4j_5IM7yZkP1`<^QD!bD;N|_kdE4WY9ex2 zvbljq!eW}H1@VQfhD>dP#oou^5wWeNRdO~<<@hY|sOE8kt)QKT6pe^GR`EU7JnJOg zpeC#qy?dz4GMfi|v6te}bQ=eqM${&KFz-+tACwnRatMW(bm(A3{()nKQw%7F*Ef_W z`RM`%R=Hm(Db26ju9FQXL4d0Kr`TpP(c$@RvjC&DC7R=;IJB(%mv@^ZI@+X0!Oic% zkcb(Xpj||i4YTOtC2iwh+7_#x6{0c47;Z}9TABHsO53NI#SJ^7*kB|8Q6tt!IQN=N zQAabxYn;fEV}j7uFL{&ByoZcw{#GC1M7_P0i3QdeEL9IEYEakv;{$;6rdE_z4NFYe&g=}8HClDvpWeJq17JzI%C@|6LeRR1lCdNQ&|&lDp<3*C&h#gp>OklaY~bZ)nfEi)jvO z!;0MqdArwH6G(55>u>3})#MT3Ila@j&7oHaHM;#Z;;28ne~4k#+1st5|5?Uj$!LD8 z4c4dqSU$ck7GnHvgUUvi2$+r>*`jLun$CO{Q{>`wZrtK=x8z=ul-2-YXL4pnOq;hs z`l>Y`G0+0Bo@r_Ho=~ z|DzINwB8^NM8b)D`L|xMw?lJVJFOpe>r*fgC9dcYKBWOE-puM}Q+4@SO+^kt9Gt14 z5fpMuDgju;Z?mIWz%qT!5~22dd?riT<*$@-85nBj*5-1A8PexobV_Vv@^e~MGtv$1 z6UP*{FITEN;;_(3i+LKMJVQjxk93Lvq@X z$x)Hi2eWO}h#xX}ufT0#uy6B{9=bh;8`AhtTU`Ae#e{(&KepcsIc{)LjMA0T3eHX= zs#NWqD(4Dj#@N{WO9O6Ib($!WJ$Y|Uh;02z@d#f`jV<}7 z1VrsF5gI37!6k)!Hn>52C(TM@(hjb6z8ko5JO)lw@Rh?(^>cXReABwYN-s6G- z(?+SSnKm=Uc|<kT(x)%mB32%D*Dvw$N-i-8l^89boOD0^)1xr z&bcXY1rCr8bKI(;@)9YL$mC|F+`ot~EeF^TuIG8?GFIT-T0gq)K^0Sj&wF??uN}~* zgUtRO_xbm|=6!u>*?f}yt2j%r_A5B1!u`b1^-*18lJ^tQxzkgqHuKhACL1g458_Wa zjOgbyePJVCE=gTc*^=4K$UHN)NQU>XuDG`ahftgSf>K0_06-5b{o6+F@^dTAz z%Bfq-?iL0)(+0cShIdg5=+L{K4lG2asinJ|KQo-C;`9~Hy0>bfMjBZ>&jYVRQzn`Z zSBL8*cb=9O?m+8{+u`A3FoA#cQ^K9UZ^g&Cj%FI(nTWjr?CvY3=YZ@72TFyP$V)$&t&V^Pdl5i{yiP%grq?TD0-i3063dXvShjdueq$*pHI2U=4RLK&G zH%^DM>iHpP$Fs;_)k!QkC8)uD^&5(yxC!gHn?+0!at5>&cRa3onc(0A`JbBaZ-eV% zN{NR{;!4^rl4=D@Qw>`NKH`0EugQ#yYI`}Uz6%jdN=w>&3JALx)P?3Qo;P)$4?HPc zUfdGTRzqsD_)c)^7@|;16he@^BzJ5Vgznv0+MG+ss%O% zdPAh#ptb~_L#v+IRs#K6^CjQ8-jfb}l8jU;n!d}}&6(ns`$b(Q8X*DBlq&Nf8e}{7 zwVI5<)N1tW7*YGXfvZ({i!K34w4UuSKJJD65*Y9Pp3dKwpya#Wl@R9b;ptZNh8uH6ceR5X0wE-)%EU^N$0Ch^S~(#b6Ck-ePQlXF#f;%if>JztQ- z7kuAfA?D(r?>2TLIgj=gxFg`6gpLUFyaYpsNDwag4Jm^DduNccZXZ25cNLPcYQg+jO4m}i{}l^%h1(`}7#?F!QT zK=DOqbiY2O*08WJ*sx{5nyBRGg|7GG?!z$sll>QQD!@W}^o-A-(_5gr?Mj+G{_X&1 z{R;;`K#Qj`xGk9Ai;0@YZg@L51{rK-(VM~iX+Z0W6Eww5wrjBa^8?&MUa&8sTwUlMCCNf z>APqmpph)1W^RX;pX-F|$qmu4a^wsyMDcBd+BKu(HJMf0=D&c(ZVXGjBz{JU)yJ!L zPxm09cH?QQZ!u?~dsXi)vHdG?(mRL8$4c%u*yYhg6n{{)bAK}vzVX4x9+r!_);Wba zAc^tQIj;DLh01Z6sHg2B6{dmnTaNa6RTDRW%63rzP7K zBUg04+6;BAuP*`Ia5!+3YR8px3(>iInIHd2H6z<78@e1d(<3$Pzl%(pJgNmYZmW<4@tmw$&{XVFgI6NZf=j}w$Ojf>md({HA*YIxeWG^rT2fn zkswCt*UKz~vJ0DFXWE7~iB66t|93zcFE>m{lT_?sbGwF3=OYb{ec$Th=()hNjl0fP z(<-m^`}@?_897YJfA5^5<5?_R*#|%myh5h)A2VSgv^qpa>i}J+`SwL4uF-81>1%z*H%S;7!Ctw{Qu3}5ibovE zh;7E!$iX3O^!Q|}-iyuPP<@#*HUH2don)QTpL^*GHijtnzy=+7Kus4T?mh#2KwH-EwT_S$D32BEpF~TAqB*TNg`xrrC~>s zsEb-LE4Rn&(=n#XT;YXes@5m%N^{dTxl;2V%|QzRqSWzhGsu5WN_{*BiW*g(_wAp) zKn0-nV@$6{CillJ!_m3~>wCgHWer}R}@=3c|U&Wi)P z@q88^&L3i8=w>EhcDA0cl)Ltiw}pVf?Z)Jq%Eje9y*AFiZpUbhJEZ&A(?dH^0<-o^ zw(jy-la`h?G0^dT&GVON@8x46yagPEli}4vx>fb$i~Rn0P7GDH8+c7k&EUw$^hzhU z69e#ayr!>Lsc7YW`c5nqc++_Y`#C@;R85lDOy}?bE#6sw)C&@e z!7<|fR99z8Y9k}nQEd3`T}M}<&=ZU(&dtEip{h!1xBcbda27>H$|&5JQV#EP*pgVq zJ0PN2usz}d_hs?X>;iMXeVdho-|*1KxRTMhMJw3_#<{%nwCa2NmhNV5Y6HnY*)@8* zmvkLT87gf$PyJt%bu<|lWBNzkIIfz!vRO60slSa&ajG}a!292YS|n$n1Irgn6!>&l z8O(ektw$BkZZ9~;@qd;VgQl#n3P*l##Dx&plh{ClrF@)Y^4x0lH1;D5O65CjVE`UOBA^n+TZT$2WCPi75>U0#BucWia4Z^#w8P|@ei86_jh zwZb~QKt9cgeVx{lx0h%(e{5PaZ#uexdi@<5&GG`@{OVRkcKZciKj}gi8AbH-6~E!v zMiW5oRp#>_JT;vXWPG%tirZ*x*avfTa@-kIf<9VX-IUsH1Up#KcsP$IYlV|~j@p&R 
zZy1~`KoNObI$CcncssE;9}P_8CyeHm&*qSEcXrDAoBf=?Sser{$!ZP*R&8JvU%-ES z%SiXP^{L&5{gLQh$J#Gt5!8NHG795O#pN?75W0lGnPihGl57mH<+vuK0{;PeOAL?W z9yb30P5wEM{Nc)6_UJl|nVbtZ8A(Q&pW0!z=Z}XbPA=WparO#>o?7Dm3pjtQmWZKV zx%Khm0gqeG@u8gMerop54?2c}U8-8?n}-q>@FZbpKZA!s+^|!72U_jEEn4K_ZH-w{}!1n|QjoyqTKx7IuJ0uFUuI zA~xARA==?@%-2}Ni*r8IRA{OzuE-ZQ6#ayxqS%zi10lSas>a`+cI%l<8C$=@z$Gg~ z=kNUZ{t89y0Be@8Q_tSPrq_QzzGHuA*9z0qK3r2}bb)vwx#PZ1!c6m68Q;%;mdVsv z@AbXEA1^vzkN>ZL$HIbixhE;GbUn8!M+D%&{e{dVfhja5D7qARKD|qKxgzsE*RRE= z&zOg=%*20d%*U}4PrM-rY@^max|kRkBqStY2ofU4opIS)8-E${4|J=iYt&0P_sn}Q z3qc9L0gz~A9=W!Lu6+<^0PB?lkb>I=#scb?WnNjV065fvwA1soovdmtp)G?r(Y%Tw zua_Ff&;w^)#4jeN&CLmMaaz!9+d<|s^P+C8S8Kk_Xj~R7ND@xt5uIU4Q!l8Ot{N~^ zR+P-UGgA=Zp`dkTNAaggoH@PUajGoS&5HB$c*3t_qWK~tf5E{tRX-~(@t27fSGa5y zT{(F`Q8?mT5qga^4NQiCZY62^U!}~GlC236RYJa{M)oha+d0TF-$SD*UU&#OJb{7rmv8~DVe znNAJu94}*R4^5Z8>tBHkhqME;2cFK7xt(k?#C2Q?EuKH*Z{TvHC4Da#Z(q#d7gFsR8}aJ2)P`X*Qy+k84}wz~Hq!$jdSzbpiC z47w_g9=wr|ADtHiT~S{xagpMCZ=wXhsnt#|G!?OKAOEzLBe>?wb{XsSghY98c7K$v z{kkpnBYNSkNG$j*3G1}W7^^Icd;kr67d{cG zYG5+^693o(j^u|?Bky9*o&D~Wm!GfEC&VQasxRGwR(3~oRc3>u!%wwNz9{DTdcvo3WoHv2A&*@9?& z=*XM0I(L6P59Cq3u9o}Ks3XqD{6D_ym>7u-li48@rK=V!DG$%$hA>66*Ma2Shl&C) zo|a#PmG$+bfv#bKoT)|~$*^tKCB&T6)KE*(!ncQkuUmkvZ2*=+p9;>R|$`chG-!yt~xHRU-ILu%(`5jPZH6Ddj{{-RFE7k!jK1&YsyqX@x^ zMkdvLBnhH0e0cp=Si@pJ+M(H!x3PTwFhOf{x)J)lrwf9^ipr7nD@S*2MT3Z9wy)1xF z4(~Q8&z&OhE{$GWYT&6lwZ`HCGP4t3=U|IZ!7gABDJ?>!M8R3NpH_+mwzwS+Q4h2% zb3YrME(Ox4U@S)HN*9+=N?pTVh+e7HNmb|KF1~b>j8m9t@uLiQixEe_fZq{!)#6G! z@KcHP;>A#LS0Uv~WLfEw5BU3vfBaYI2z1#0_8#?*y{EFvw9=uB?(I69r2*^Wq{aLw{N_Gw~`*&Z%X;|$>tCYMI z9H_rDKIl}VDc*gK*j%ûRT9*FI1HFq%+Y&mP5Ql}c_4F|r^k!E8)IMO1(_WjdT++_oH3 z&=8{g>0aX|I*URj@2*d27DhwAEPJ?u_!}E|k>`D$^*&<8ves(Xa>N>4-A11SYatUO zC3JlV@UEwMWb+@lXJl?Z;CK9V`SCTQosioJn``|Zi-IZ=bK4E*wq&+CGG@>XLlW*P zLIy^VUO|J*24vP9w>=Z2$!;K`vBe8|D6{Z4zz z)rH@^M07j0pE1lA)kLAyP%|6S;}ngXmD=xS70dDMkufX={%I75moC!=bUi+95&s)D z+_!aju6w^c=KU!xt*n3k!ejpJ?Dc*#9cOxJ6~WRKYDy9LVCQ4p0%6L_%e%|fDiY0C z0EDonEdIb0k^=u_U_PF%bb7J%iyyn+U^-Bi_=SvmX)EC4vLEk5zSR8uhwS?H zR;`fHwt4f5uFMW%4zLvVfTvnVhhC2Zdx>m5M`8+Dh+qJ3ZSx4#->enn` zJ@OwJm@E$u;~&7Lx{>$s`gv#yO<5Vh)$sk1$(oF?S6xP4w)e%|k>%&$H56~m9HXbgw!}nzv|3J{AeU}aXf!dD4X8KJIPisTV%#DDetOfA@O?Z_x5M)W5=utdNCN0Q(mlGlP5 zZ4PG%;i=IyX@9}(acQ#~GQm|eFn#9lWl5E@20{kkeZ;~joVtM<>O-z@G9xIB_!|lk z&M;qV};yDlT$n#~yj&e2gmfJHd#&8X=&<-VX*1^9xu!msD4n_bp)|ZjMV^ zbg>{-KB7d9Z2jaPnt*|9*t}>6Vjkd15?mVuhh+#j%LY4+4Gw{h9!5?Elc#t8+IKf?4?4i~GOO=XXlHMAV)EOwO5shVqV6(Mi24GX!{rO071PAvo z1AEMitUHd|9kyIPOJwtY9dCFxpqY*_#7C7~{`DL!_p<)3#zXIT9{uU$CVmxOyXVOM z4LvQ{`B?6zg#hd?lOUpfioBxkV16)}l!v+wNwk1f$FA*1)fflz#?CoC+R*yGpPyef zi#bk_DKz%`m*v*?PmOO^baw>&f!kCHv|%?tY1|lDNWeZ%J$*aa5XRzWiABbV*%JYCVfI}%TG&6&- zQ_xYW8JB22H~qOXh3lt?NM1y=Y9U!0sP-?eRSY-dY8{qIt4re{B{2_94VIp<(t0YM z_(!qXQ{_>puWj9DJVrMtC=&O6HO~6ja(|&^=$V~8Y7hZ|-o=E{nqPASlLXci2}z;- zvfD_~9Iz%72# zR#&?h5$%ZN{2(IF6Bc^S-(3^O0HQXOMQj2akLp+JQF#Tk*%?z3r3aYb@CHuR6a3yi;KAVddCITmb8ss znYT_aOOr)@Y4~W73|M^CZ{roolZQ*{#aq1L`x9F=iqxkVTpm>MWgiKOsO2=wiKu>7 z+cvgVR0|eRE-Ye#sYfesR^R}0DPY+~Ha}NH;K0)lI;Pk(kn`*D`{|nP00E}88 z6c_<}P9JP;E4*GlBJ!N`(~Q^E)rpCV3v8S%^#fXe#iWJI6$6?k#3Uq9VuHu8i1;iW zw0pq62JuI4si(&qx$G* z%JVOu;&r2Ba!FY1IWNu`CzOH92U%I(FUp;U`W0S{i+)`xNhHi9gDkz#7U6nDheeza zlxR~fcL0VZgxX$j^No&}7Nxf2aexFk@G2Y+clv-=k%>x}iPA?F>+N}fX?FfFq&$$b zZDNts&;V@v;R@RsN%a?Qbeq4vsy>+wcf|hdgw_v%+fusmi9TtgMP zmH`tU57dJoI?Z}YCXC&i`&I_wr>bihQJY|R$Z{~1qs+L-ht%}4eF4tHC4@!#2pccK z8TNf-Zcncs@!awvN%#X%y!g?vGEG(iwDYdf1cwYo*p_q|Yj~4q*Y@CqeD+9b9L>c3 zAKt_}3-{BV0-?(hb(96uPrkAdUW`QUuq*+IOx z-}LlOnEEVl#R>K^0YuE?vZ%8f<&^79mv_4P#F6vq-1eG$#Sc6$M+-t`eC z-vCWk>p`-Fb-|V 
z(V)4&$Y;u!G!$>)ZrRfiv8kCx%@#958p9^Jl({f|gD7r^PF&zd1-tDmZ(ZR|94TY7 zjvXDgnOUUtB}Y7MV*QzLnj1ZJIKD9BK?QgB|APORHQhHD2pEVP(cBNBs5?X^#AO%s z*CgY^y>DQnAOO?oL@jb{^CVa6-;t^Uc%wdoadE5)q56*> zq!x60CO>9n6i$H9(MQ*?oSpsy*Z~AYL4I#6->(B2k^90liKeEe#LC}4h2lE>1X7b_ z627MzNRhdEu3SXlG|9+r7Y{A4he}Zz#X(&N_v#&x9Y7mKuHI}U5nJTazFtzFm=YM6 z!@872PqxB(1{F7OP(qgxaO4Uu1mhWmnJKhvV8lQw_6Eu$J2#<(nXR%#6q5|!NUYto zD1c&eId&KUxZ8i!MZ7!aI0arZ-q;YAoQ|!XS=!s%op+jytyc%^D$Q>+Ms)Ap1HO?Q zG~LPvtF${G!wBL#1;#6ZPfis)QVcUTTW++O2kdqMQMYhlZ$_6WnjLAQE2zsECf4Hq zd}F@dqoq2pxAm#Gm>Qf6aM&%C3>dz>J=&zyU zmK=Nhzw4>s+Fx7<@F368`V*0Z@2EF7IE-_T<}ZhU!S)Z3vaeSzrD`=QH5>2mpSG^< zRt0Q-K%Tl=)7QT~*C@9ePg^~>cr~f2r^dtlEFp}M0Lq7nL7*Q|+k*cQ?F<;$Sh(&` zZWEXseUelzj89Jw*N#p730oFk{rOUS>W2Yf6NGHpcB8VR_KFJ4#SL6FJANx>Y|{%0 zx87it$u^BqT^`~6N8!5tN(~`goV${O-jZm5UVwh`bA4Yw{*Ru=2KkiWZlONQVOR-H zILR1Rkld722?M>z>Ksm~42zMqo+p3Qh)rzbI3NA$b1*P<0atmK;41G@k&>Md_@n86 z|3O^x>{{`1Q@fJS5K@SFd${{JFv{>J)~q~7)NpuwXctd5-5x7FnO&6%<8mGsj}sI8 zMZt|H#mGDm;DWrKw+PC3p5%V50Nc(Xz=G@Qc5<@Pcw;4^X~-*J^L$qcch19V=m|ld48@P=GLw)i2zSa3j&DU+TY_1LIG?~ znBU_)cTHQfmsO}4s(6od`z_+Yq?&@X@L5v-Yd8(LR&WO|C&5w2g9_d0`3-~9N)Vs6 zokPG)t}pQA;ULrSfS;AdB9y{r(S>8x3PC>2F-%R-*K$CPqKO4P%B z)(Z7)7uxOZVcct?&FxObArBaUm4a&q6dNruTO@}jPV#7^w+vublvV-tNr(}YVrgW|H4KfU$ZgaqMYIYWlaw{E) zt`fzHuC1z`h1+`43y%i830X&Hre%h%gBf7b`ZgDUE)e!NjbKZzl9jWDWIGQ0$DF zTD@K{S_Wv&oc~I)KhnUxnwKV5uoObFeg|>mQczxtzo5I9Wp7L+?k5kq^O=@#kice3 zADNX@D?N$pk3^gMoDib^uqfi6u`NwzFs7E0L`_uWx2L zKrsS_ni|o7zq#eF{Q}!OpfIW~KH+rum&*QRai+dcK@+JG83G_#%n(41g63z@(Wg3o zu%P%XE$512GFUkW{G!C}DHS>vj(TrCv!G6y{|oyuSOQ1BPK(#Q2QFCR08u7iT~&1t z$;NyYDyAPMr9E$8%l%67w;cl?p5XgW+57s$elO3lwtaa%)+X5(a>uMIEuR6gv|m99 z8_g!IScI`qV3<%Jh>eb0gK?Uu(53wR9x5YmO>hgk6ZEyNAL!~Q1*sl69E?!h{WR0b zEX+?CXHrZsmmgxvLDCjwif1mT9(S9INz8gQ;-lm3C44Vmf-Wv3s(A`NEBr5Fkb-|Y zN@ZGSnFOxYYAY6IWPV{H;kp#`avFz<7xZ7nSb^RRp;*0Bjzk{IX-Wt!q`}c&x8i>E zObSJst#+`~;*#_7A7zemKrgd$O3ZI|q?_|oYWF%cCaGjBI#XShN_lxBOCNn!nyDto zrD8wIFS9Ax3In%|2Nqn98JF+Kz(CkR-(v-zY)?s{Surqnm`^v!79u__FG4|7MPK!y zScl#WUVj?7-^!rXo~<<>+KGuZaH@E_!+YNIL$tW`7$&IbUujr1SHvlz1;=`RV<0DO z)o|c&a&kHYHKL6m;Y0B?`@7?Iyo54Av@~svrqu)ZgKOL@ooXi;qhMvaA{kTM( zwm9gqpo+_ZEJxN9PDsv1Co8habSvvh^D>rnT22fgqDFutYD9`{({}-W0e%D~jwTi; zVLomV?2^bwXt$g*?KTs3&Ed}sby_-_Nb|LZG^cL&$OYYuzb?_*-n=R{ui|vq4={CV zi#m{n0yY5!?&_j2CGZah?-~JVt*@pN@l!?HEw+yPh4B6T{W{G^DG>)oxuEy4w_;{M-J0gH#+~yX&KK08lS*9KeuOKO)_;$puj~(^?c> zhHh-)y+3_sni)~x7wquD?`2TlAjnVi=PP&4`xg|ooh^6_Q2Pr{Tf$a)y^9PN+jbw9 ziWVLf^z;ZlpKa^N0n&S$WYE9hJ_Zba^uri;;WmNLKDi#(LCM6#1T+`HKRe|AZZ_?u zmzcy98}3%HFI)SegqiDD4mSZ#Et1@3hdTC?z|n0*L`6pGNjns6LBKWA0MRUjyXmOi zx4n3AL;V^+G@pCKKe%XJU&$8o*w52vL|s21PZc04*$SE!eUS815I49ovxrNXi?RC` zPUh0Q5gItLO1}5buS|8Q6n4 z7#bWh7PviR$Sg21(xiJ0riq1i-$8>}7ULQ=_T{K|2vyMiS3}E!+e_IrmnJJ&Q9=&~ zwhwHWT+l*d6?*n5aivIoqmM>tXLLdrV<<|-GZUu_)63Yj`pA)JfdidrQ9Q9IC{NB* z;|V9%)|xH&G}x{h?s|Qaz3=)RDydT}R4N zO9tEr(ES5COqXRB3!vVAlG`TK@p^Gswx5M54a8InuXDkoBn z>^h}m8N#u9SHhFI2wm}C0QmDA9KXK^+Wbpp&%jHc4_%QxbD~`2I2dy7i3m|1SbMe* z9#~0?==-2p-AJd`N(ISj>HeuMLVmmZuWR}_Eyryz6##oGSuIdiM0~q{XC$2#y7YQU z=$Zcvs$aY_>)K-ydSN!mA-~eFR*xi1du1Jq zVpx^xUu9@$NW{sBrQa9I7$n73u0L8OfqS;pfcWxs*N`7H;{XUNgaAx6g`y6pGadi0 z4b4?b_L5*F%-sO<_`(65s;(?j#}jFLNV?{G0QWPSk=P%mSuFW!OBd_-8T1J!KekR7 zw}cNRIVD*xCe6Ru7xFJU#w72g5Nn(R(WtHNB7H$fsG!~t;8v&pCmH(3^}Qv4_*)FxI9@ zx*zba$H$;m%Zwbiv9V#RtIMyJ@a4N4AaMpzr7}}uhvhF2fT6WZGEi}YINOZV3_DFt z1Bt1Ut6yN{9*-Di9kx#Vb4{l=q(h`}#h1x#q%P6H73byt_x+LsQ6bp;kZ=)`H&3|3 z*~=yp`60Y_swQF?et%V7S#Wb}-{9p6l7q*Uezgr{OcbRIE1 z=WkI@UzfU78Wd^5x!>GdFOMN_lyv_6R%=e+63sa1amOt$s9;(b^fz~f9nQAHtG z{f&c}85N*O?9Rkqu@J{eNJvCxWLRCe8dJI-l{HxR<2kLjUX67F7NT6;kp`cPk-G 
zyByxdL5SLEqMz3$&ajP62EodDl4SpG&M_F`;P#0z^7Z~mV!dCc?+lh@h{qe( z;|ir{V&pr%hVkcVX%5dz z1JTNPhnJnb^v-+6J$aON(aNODr}>|DXDyg>W2xWGnE$;t05AIoDHLq}@qr2^g^no# zgb#lQ8HeMZP8T2^9%%*^-d=|^$*>5&`#s5a3s@SV@}(X88r-}xg<&NohV1R_O*1uN zim&nZ1}xg1&llu0f^j9F56nYSSm0k783*poZ30N^!Vi0O9g~N1SjM{X+ZRqq!D zppdqdfwZ3lRp)DB7pp#Y6HGUNVeqG(b3VdCtF~D>TLOuQyOY&YrxSQ=Q@}( zY6ft}`Gegbe+-Z{8NeNSC>}S)n%9|D8@c3*re=Li{Usv7H(ai?&khdtyp-xo0e|iA zpqKqQjCyGHU)0^-XYZ4UgbSLiv20sBW7jQkTFBH}fj9uH>KD5(Iu2m<`0Ubk)_Lg% z!nSe>Wl`~PJ8%c@eD0gbi~R?@fw3lF%89^GL?kzL8S(NmOi<-!>_K9;I^iGS?^V!S#oFjk+|3X)eOIADxuUEc6O3ObQ|D-MXa z$7@zKXJ*Da{!Mo)`gYu=;P&{x>KQxMiIj3kjIoxWss@L_+ugey;Bx&ZTa7F#79Ew z^_!eRnTmSX+)iHKiKqP2^UJ(r2LoW4^#g=fntBfCJ**QVNbF9}^|j_1w~u=}v%m|j zuJl`h4ajCA{KvKxHNS_r-{u_qUs%80o181xV~h}^$uIL8`g|B&a2Trt!qOoLFtOv3XBQh>Dh2hil+#N(E$vdWmIU`gCI%O2y!6y|&RHPjs-{iu(y z9f-U~nCoPZPLo!3Lu0 zoc0^1S2@{bZgIB%j@E42 zEZ&!$Uy0HDH4UY%w4iXSw8D0f0frt~et9bbgo5~~`@2M|s!iz*dx~rz;`?uj? zP3QPm>|>vg7eQ~Y50@qP<_{mHL)cQu6ejv4Kkky0IWf53bw@a_SM^hg4Y5SGkT~J0 z-FUre1WxXS?iUsVvpsPCG!4vyt5Eaz0q2o*kn_=81-tVeLj=$;;sgDZM#(Q{EwA(S z&fr}C#eYpzH{w!yowCJv1%dlHM7ql^mib~WJdmRM5?DCX8TH?*l&bjv?j>8m*fjYb z4NL^*M()q+c=^qw(P&c-9%G9U2XH27qYvs=J|M~COmQO!@VZW@fY#W=^&QO7K^&N< z7t2vFk)blAQGdVHPIRk?Hg5wQu4|27eZjWD8dQc`Cu6&aGmVRCw}e_Y_VB9)u_SE<+moqzv-T4LaX2&>gn8Q|%Y z`zNvBvzSCipO@2(Yuy2Wq&z8s8sdW%u`_NR0M}pvr4!$648CHA=L@cb@l-nMR#=V! z1g*RcU@oB#_=l0t?ey>>a{&A@wWY=AooT-XUyuP#Kk3+<4nY6=A3sZJ5g zk{uAuX+}*%8!`!s$&h@dQXkl=N49hjmW7uwySkDCb|S*F>92VwdgoLp!)^=w zdE)+90XQC{hXq|2k!}>;W)!6Ab0zYW0>7PN)Mb z;cW5A^H__rg!8rrPhH${+ynSDG$v&CvH zmzD2?xQJz8s1$}uI#P+s6`ozhRl={xHXfiCOGGO$!Loz4y=`!TJHYrAiQkDv&SU zed=O3xpe=_5r%oT{>zi=rWNA%F8{6Vg$0cD+;Si-+k+C`2A_z+-W;_{olSQ_VwfZ%!mSbx@~i zs`$x1SK&EMl^T?dJz5gpsgZ%pn%ZI{oif8Ys6CRG(8WiMjj*#LVPNMWAo(Mf4M)8Z ziT@Hq9`wx(NBmf=aQ8wOfpkGp@FNZD!NI%1z*^(xLP`(-d_0T4$w-wv(Qtw>iq zUP%Oa(ZN+Fv@?2v2GhaXz~h_&DSM*LgdOr=$AoMXH^yRSvH0vP*6#;6ry-T?M1e#} zA%E^M7P_A^K19`Vsfq_<#$pHRu;)^HGIxv!=-Iv_hV{?c1d+i&Ku=v+K;6>k?m>|p z-6gET1ztG~??G%mP-p#`g8huw7qvz4hf~nq1cEmlV;VN{H)BliYVDCsnXb&ZoovUs zRVj&!xaKh!{k#ak2btT)b|%oBktU!0u?>XF13&VVRFS}zJ!;Qwos?XjW% z?t$>-5QHI^i!{>|L-IAE83@H8*N}AmGZBcRXW8dfv7Dp@=$KTDayG?6(yi|^<*r-5 zq0cncQ7$M3=fYoHw@LMUJ$b)*w%(;19JjyGM}3qrE$9lHvovA=NX?)%g^;}QmJg`j z_dgPyjOj?&UvAvV5Wf1F{JQ-hpr4kGLcsq0PBA~KVa14oaN{-pD#j#)e1*uedR& z>($2N1;h2iQ=F4f$Sfqeohp{37M;{-RQYwa^=(UtpxzVy*q7_xA0slg{u5`JXxK;{BJn*snz>^O@ z3SG;U6Qqeb)+}pL#*;;AV)(-5`m+czJ#st!@eo0^j*01C=c=6hUS0&mwjPlkb#hTL z!1@8LcIPDq1%`3vbL2snWEE&jwD%f`eT zp1|Tt8+ubn-%no0@inVWQX?s}W*JE)D!aKDjSncMnQ7{@B!j=KMDt%|)?JepI63$s z$jS2ZVy!B%z7u8b&lRvYEBpzf{p0j2_g`J13Wgym#t2bO!SZRdWtsutjVJY@Z3f!< zTR!haObfbpV)4V32ep??+f_)Uf68{IXs4 z3%Bl9;m2Os#=K?AO6mj$Kl_46j$tzECW`o#6zMS%DKtW2FA}*kw~lWCdJl zSAhJc*33$EB?qkqz*4Zh;eSRcOnnxQXM4k%~#9u&Xs)TuuiA;HC=1tr#mVb=rXP7Tn>YG_SIbt<99v{9vjEY7Twjz51D~ex& z;UdG)JdUxc)+_WMRD%Sq0p2(FUJcG?YYhK-F6nQ{OL}T{dhM z1ZPc}ElU;-MOAzOkc4R7BVgyky*03x*Oylzj-hZH3?Tjb{kvXSdWiQ6gen-z3a+;j z-RbnhFfI+LKt$6?r7{h4Ccrt~aT0{?W4p+o6?6st8&6I?Y)*BqD=1&BSzO^AwDX|ygjN-yd z$IL6_5?KuUw*h>$#by=)ykzc!>UG7+H`@y}nq(}cbOWNRsz*|XuCVtc2gOO`lau7= z+aLh^Fb7i}Qr4q~{=qY3Vd~$q4UCJnwum2sEo&5!c!>oslvM>M`*X!ptH=Y=H_T;K zmRG$K3s2t?zMbb#?sbQR)4)Ie6*J&Hz}d5+0d?JTyjuQ(Ri5%q*e2n?aapENn9 zn1zFITM*!EN&a<F>AW8ysv4uJ_ z%-=8NZjV5nO9iT~V$cj%-n<7$cYpAlmcL6X0)(A;z&94TKXriCYUR=KEDCP4R7rzoIM8b}eQXZ02@HUG1zts*a%Q(Ghht1n+9Y1iWlVNlHgOLyuz z5w%44GeK67tg1sFAIm7_muyqyuYuL}DVoTG7|QrxrKPse*8^hJRZ~>!9$QnN<05tS z^_e(aX3e*VchqECw3+yxN|jxfcXDf_g3uN?sGGBDOO^2?VLWS@$!+hgw=LBoj67oU zs3p|l#J|Ga?N9DHGyyO-DtKZ$Z9dfIj19Sfa}`xCD+VnHJET(E{HX2f=}sl)q&csjkp^gg7Q*>4>IntmL}6L 
zkL7{51}Wx#Sydr;KpgvlO5P2Ot9)es@#|$^Vql@DHfCO5o;J3&`~bus(P`C%!gV6T zkd>N&mR4$#3k=S0Xo+bgfT;x;Z_mc9w~J@;NBP*mJQfmP{sMSlse%r*<_v%(|IWrARm+D1N4ZXG_P-vo0_8M+tX|E2!;o2 zY@)2;>IE&yC_LCHRv>5;3RyOKh42SA>X)quO)|;QFXDq}Px(|1wzH*Gu30^NRPZ*K zx}l00k>iemTVEwmzR0xb03C@ecy~oM(XdI>36#kdAhx0!p4oKbAeUmL3B;PQ3Sqrc zzVNp*Gzmxb1`abjb?4o5kqOjyd(F-b`&W_5`Ra3usnp%xOVy1nIG+geIi^+8mz3Ykv8k(_UV_MWZ8P;`Lyp zsS-rNfrw^r1B#AmfB&BBkdqit1ZjjdQ7)MlXN-9IQHm4HU44eD+?V_S z1K|W7EIXyhHxbARtgqvU$~#nROfQ|d=u>w+$t&;>Tc|Vr7@vj>N_1QuUmS=+-j`V% z;#}Jyc^cNr)tN(=7t$aO#-YYROWR6|IL*3X@r$|7TbWdxemWT$16sC`8WMeAS>9(X z)Ek6r#=$L_Ko1@(e|0=obC+Gri66nj0N^46-jM*s zkVBuq^PUJy`d@ax8ofXBdNZ4>VePpnsb~)dyaX_9Kq@;5_PMi;0P_S5piukNYP#@v zP!fmcu>#Ld6v-NWGm*spW`?zXtvG^RSnV^cS)F5+fswfsv1%HJ_+&gxVL{+=oV}JX2Qn->S`WVfU zjOfT)g=vG?N9dveNiCy7Ov3&9!Pl_xdqu8)F$3gpy`XdsWGIXY2o__fgnjYGR+^E+R z?+w?z55lO6re{LANYH*C-n@4QqMtD>E)nH#upDyh#X`RQxpqknXKw_?TmE~zv?Q57 zAvN27>YDP~zI$Ao#+h-+wZh$t5DE#;x>Sqv7&AASxgwA57yH2Ki_@-M`>bEBk}Xj%4{>SawHK_9p60VFPGJ zwb6XWWN?{F>tG%Ro_?l{!aY%t*PB04xCDQY@K&k5QbmQF&y8OwT-2!UR)kqmzRPkra)F2Rw_bf zr_laOJO+)aN!r*}0Lj`8+-_gN$1mB@K*#=poO)Whxz}05&Oa%G0VaWKEv{L=QMO2nm+q5fv2l6-PWl?q9aVCQDhAjtVuW z^{{!lnMo6dWTyjYvaCcB&qlyhT8A_RS!$b6QW@>7{-Zppvb+p8x`gNJ zg8Us`eMAZe#H-B(-t5)&2jG6of$kaJf9ZN36WWbrVSt-V<;(fvRp8)1BV!)MaZCPk zx_!X>cNarfl2YC5LFe(ig^?835lsmgS^EWEIRCfDeCTV5_)#EU-AL+-7sTNJ632mq7e=qmk_r5m4X_Y%xbMthby@uJPp0XHDXmMg2a~!`P9L z#ACGJt)Y1~{kLq!c+hV5Psk&>xNkjC$>5;KYoC?cx<(Nimo*ZgCo%bnIXt$y)MnWa zg)H65MlYTv+@q^IlEtvN_wgsu@Q-bI1GhA?6OTvBpd3=K*VtoODWr3I;42yfJPTqeQl_U-y_4yy`2^t~-?goFExdn%{E+cgGEhq8Y{ZP^|w%p z?66UtO=Y0_1aook@zHbUMhqpX)O9U}n^0f(E+`_~%hZX16cUul1< z?ig{9;Co*Xdu%!H>tjM`#y?rEK&pch5X!nhI;Rq_%I+wtZOVH%jDtK?xAT;ofmi2A z&ls8)gD2KwLy>}n6%H;gJ^{rA_xEC3TidS=he~A8^D1}9xrh&pQSX4!NK=`#=8Uld z*SB^)SW5Q{@Vxaly8J(6&B}N42TvvQy0N9Z9#ZAz=GrT*6d%#3l>Q0D zV*cuWZ~gp8Lv~d?B@&KX6uHV5BSK98zKVqF)IB!H2Og>lmqX!FWDaFzYf*fXb8XM| zVBXesOxv-mvsDO(N_L6(@89sktcx=WZ@2MP`4&^P2e{N0bB!k_bIOY|3GwxjeGWH6 zk=?%|`0u@6WpF%K+1E1JXoh?EVj5F($D$2sU;px(l*|DlaVptaGj@^{qhVnK+6|v? 
zdV2wll7Lk~kel@y3z+LvR#dfCaISA1rzWK{ z-Am&ipD^%2+BU=4)w^j?-LY7)6E5~F5{PjmJalLeeCWjb>TIs7vBjxpZWn})YqEnb zR7^>uirHZgIsh304LqabN2&yq=`H$4@-d{=arr#8qSDu)9ZIc6bTv{Zf^%He%@Skg z!t&^POM^=Dz-Eq(JMo$Tcfj56w3_u5NLHpikNY|I`#BNk0esI{E9ArRbV6EMsnMS* zSAuOPPmsa2uXsf&tJz`(Vzi?;SkWeNOzpBsaX{7KT@m5l2Ns6Tl}X49ryMfh?q?6hoFn zgFu(#Pe((}gBP>%iF|+1M$u|VK4Qe)l473J8|(IBZDh%LxB6!3X#C1~$Cu?0j1$7_ z!;db+kJPVh-q~wU0|VLM87jCO1--K;2yIbw5Vp>5i_V5-@^G7_20Ez@*YY3g#28)6 z#GJWKv5?$zIe2K~|J8_afW`N(}brgokjN50<)*;Qj(tb12i-wq|eZA zZ^CezrQn|&Q6<{(1Vx^w5LBd=MzZIAU({m%A;S0P)34H9VS*yCLr9SdgW=#d8>!2i6YYdWyw8%IkmnZ&V4D2(P#zAOam3mZMW)ao3Rh|1koo%M{upyZ zJ6sg3Srdl^4U|iWJL9V6&0yG}!@)tvf`E@Ek*z*v?r}XSn#g4Mq26*O*ar$0o3|Mt zLFT}k>bKv_YB~e@<&qYAvP)TRQTb#s)5OXa(9|e4sR_}(CbwMVXObR+Wi}+lqDw?R z+Zt00y0fanRXn$=rHlrJ>bDXk;R~4{{ApaPv!wv7DcS~>roZt1y8SZINyXb)KR#JX zVXr+nSVi43oKQxwWJ+GF8>TFiyUxiB=4kJn{*Hu@HFfAcN|5ovstLUB&{X3Gl@jnI zv*ZT|z@E6)_C;-DF&t41`*&VLPIL}Sw#SBJv0D+o-gRrZ$jlTIokxk*Ua~MSS^AT_ zt`kuW*C?Agvzist(-W%aFSU4ycm~(^F^y9+%IXfrXlSda!a)}URTxJ(@khFR^#i2F zYU)p>b5n5{m=`sp#!+)sqLXJ%i=%ED{dYgGX(qGGEHGJ8`MKu@$WgoG-7L3mt*RzTsX_5CXU~p{{5LQuJDlNbIKE45-C17i6E{WVu<()=RQBl$O%x=sdH|GhZl=Cx*k^5UIi!d384|?F+j!1?@d!8>p+HP&# z%*r#>EXK%Ag|)Rj8V##086x+ai#A7*{9sJawPg}#>n{bZ*5HpO7qMFJqzKlSD|d;w zpqWSQP;GnLrPFD4Hn~=ICRG<$>PS#W%f4uDr%AD%wDQQey`gGYM31M5ewdk*`Yop8 zp|PxEvDg(8Xf%rUwufw>)l#11KV?8oYMQ{?LT0Ev{wzVuyli?KU^A!S>+W zKLqh8F*{d8$pj4<_X#)%)oK{@=^`dLgWkRMsVN%DTs;>F^1VYDlV@9xA2Y$FmuH)4 z-8O;C$T?!{x1@fP`+3dQ=d{H6IkzaL&mC%mjw|wY>oYxy#L+G~toVrJR(rU=tilBS zFeTQ|^9I(3qL2k}QyRW7!E1@-+Sm(1{6P7*zV$ss)daEVSRi5&0-JfGLE+fG`|AG;Cxdj1wEq96AB)% zxloYch1Zx>`6}kr5xZpT-@T5L+78k4c0kro6MA z8<&w0qSrBm||v!YmysS#cOqU(1i6O_H#Rm?$q)zq&tng77;Lt zSIU65sn{L;Yr!h}DQqUq7t%E*p_(v)brNMS4iEgjZqYHWN=D!)#rK(H*g&PM$y$i? z0%L)%C!c*=CSUN8&7NDFpYAxOxXCZ)A=4Kp`|)owDn5!5z=~)4vW6-$k9K}i>|yffj!6r=9$8^e$$M^4iAk0uOvz|`~@kwO-G_<@r00 z@5kNpr*rImQIh|>tP4f?E)j7(KLvgR4OOQ9`=W`wA`*P_eZr`byc>#$tXi(TxWSWN zc|t-YIehSo3p{AD+nZ>0v7-w)S*+`3B>wptMvB@$d#Z`lKP%GsW4=PHhmnMO&6)H~ zXtC@qb>kE3gy%|2Cu?rs3rV=!ov$K)X0w;RY$#bc>&ZoH62DA`EjnjaRbz^-6!FlF zIUe5n7qm|izY*5azK|M+4Z5(&bKs~s>F@m z49ZphuBV7?%iWJj#3;R<_%PSw5Bp$VMMz}L#infbp_LZsWWV)R0 zczOA|Xs7mq9aqM{PHEy7sy`V-2bg<{bsOT)Ou$LhWHG}>5g`>qgR=|@ZFZty9EiW2 zU5Q#Q9iS*0ll<&V;qgR$74I9BZjfu!jm#AqdB^OM>PUjOu_jDN!8mR(@`8UYGO#rx zjxmIxUbWEJy2*$ejZ5q-*$fY zu^vYWV=u1Zj&yZ;?WfCk9A=DB^sE5Jx%=lgnD?;&^D!n<+_TwuD5^V6pv=hCaVo4 z-2vXM?W_myht2!;F6|xe5|bWAhD7)7sF+CadlD}~K2Bi`(T=gS4Q%XSlMiS~39$w@ zbqM6Lif&WctTQyzTAOO80ecy;TrDmKrf8LUk*%^ym3QZVB_%ZQetlT>A?~r^f9LCV zH!DbVdVV(0YNJRayR#X3D2qV)(v_WJR3_LQIj}|Ih5h#|{-|VGOMLELUc(BJQ<95( z`r|~bVnOYtgia-4atG9h!L9BX$!VPo_O5x2QcXg>EVB`_6AWouwimLN|WZz=JPOI-YLElQ|}V2;W=4koqB7} zOv>0$R4n8qG(;j<;@`?L)Euy!3eV8S&TQx2IZif_ZRcmi;x!zqmt41&(hQCS z2(gc&naVyJM4B@W*i?_X&_ zt0Cs}-Bc#c`N|C8zd%*1NF)5g#C#7!>VjsUd40|s5;fYWyCZd*F(AtLVvnzYgjy!y z-U6ezYozl!o1_KF$}6V0oY^c9Cpog<;(~@#?NyO{V6G>7d(Y5sL1<;;Og`7g{a)~M z`(i@Si23vg8xhK*fB*jKeLCK>M#v&0gpW5B5Y*<;YKID2tk%BO#amOmTr`=PztQCs z$uOFcA(G<_1HHjvELKXEI4&R1&P9j*mQbcuuqsW9zEK-~0N5|U_CL3UEEN)VKN&=E z_FNob8|k(=wYT{DRYie8TV%@SNMe*D($U&_t=-7uClJ6#57k(jE8r^Kq0eAy$+Yz! 
zn13F>`v468BV&YSEdHzx2~u~dZG(7yyn$7>&H4YM>m8%R>Y{Gpu(561wv)ze8XIlQ z#pOYg_m1)Xxc5(fbYwK=oW1wjYpprgoSvuRvU?~7JyL9`&wRN{ z4YXOcgEpinZKntsgg>~?AuVw{>T|G430^n9vLdr8r=(<2p)}hMqS-}&O;T9^CIyX1RLfM04fFHVKohBxq$2Xc?_p6e>W{USY2^c_MNv)aLcnUuj*AC&KbE`!a2EQpZS$Ka|q#tCE&{m8EuUE6emf@veFU| z0RA~Z%KT($6CY+wnT^GuWm16?aQ0cchy;9r0Wx6G@1;4pXoG2`6hbEUN>$Xbu|*Gv zvbBDY2U+CiJ9Bk^H8COVtuU=}ThnISXhjBze z-!0%+|Kv{~Q}IHZ86_${cjG9h*5!rhjZDwGBRc%NFqGUahtmlNyKSd}A;F7V8QEn) z>)6sJqY)Gqd<7`vwn90$lr9GjS3>N*%y@YpbW-f7PkP6O$c|`*K7}T&1bSrZ% zW0OOYt5@RWXy!V$s@99mi(A{*r9|VKRM&aR12_Rl930m zwMHbrT-bn`zZMSusbY@#<@&wK=_;r^AY%S{b$s&ceAGr9hBNep6JfC9+WBpBHOCT) z@Mtj%*0n1a(MZ=sC%ngm&`T&9q2*3wdr{5hT%|-djt{BJ2EiGPz?X52bd|!uK$zb6 z5@#Em7#OcrWM@7iTkitAnO+T#-Le>I#g-f|a7p9zQ1{Q{7xuZs#k)pIIy@e1;IWxO z1O){j-(N1K;uPd zzb!0G^LeYI$M-}xLuV8bd|PpOi)8ybNkach88^`HG5WOLZXfXF@YcWs{%{3KFN?6} z+oV*EDXN0LlbET4w~YlMKL2U=u47*XH7spZHnMTcz#a;hp&VC9$P~ zU^pGo3<}nz`HrDlZlQKxllhuMBI}!&q1;RfT6S!GsEK!l(C4IIF%NNHZ`w@`N_xO+ z)0A`~`1`VDL^6r9_x?#IXo`tEwd65-4W zPK*+BYyAuBWoIWs!UV-xs>9o|fqEUZQ`gJNwB3H1O_zy!=%-dtgoR$a9+JV4XVl)cPsthV9I9NAW6UJ7y| z@FrN^xDT%nQe1Xs1; z*@okJl2WV`dpFc&A9lTNgl?OQZDKwuWWMUCp!obN4+0;!o2N_|Kic@s88O(Bmlgw? zFtozMQ~tY?xzKu;m&}9rk9w((5Vqm4eVC=6u!JPgFoUfiMssImWc+ob_MB2%t*a%2 zkGB$2tp1KHB_q=VNX1c_f2(bDeYroYHthXkp5xEm(RtF(6}5XDwL&hIkp8<^?tokw zWwDmZnhU;aD2q6n!C#Rb6Ig?HNC?urysHT)S^L{Qm^}AN1*X*lh3w#aYc0I*8?!QU zP31bRh^1Usv*sHbA8O zkOl465!Kn88GsXHXe-EZZQdQn?_dZA8NX>g*T_`1y7~Q z0X^)089xe;@gM!hvFZ*a^cbN|0#ebXfCFIA4;)bU?Gl_<mn@ryg82ZuIqxRX;Y-&~)`_Au7RCbcaNO?t|x?#Tv zttNr})nFt7j!A@?=zmLe&IZQfVstg;xD%UPC~s9~HV+?=b*i``Ykfr~4Nk!4rebMw z`gC_+mU_6+nxfWGfE72z@>{@%Wa6J!R0nyeu~%_u+qFN)MuKA+s|O+?;k#Iz;nl`+ z$j!#r7{*MDYO2UXN#BL&fF<3>OIT5AVZu>KoySmiM@Kl6S+dw61H~}m{rRA$prN)* z`R+$p?n2ztwX?0=18O%cM;=Q|MCPv()2MM*N*?zQ{kH+UA zUAn!!O`J$)H#eFe(`5Z48ACXYv<_&IRXOa*NDMpFe)$fAXT4MdX=rE&_$^^=o}Doq z>A`J3Zs;G1y7MI3ol>t!tutFPI}@H?p3R2YJVubErJEk=k^L7^&FGMkp;v7o_UW}E z3CeIPozLhrJ@z&xh4_WA-xX1Vr_J}x>fXTdTuGpdYlC?AzC0XiyYJM=0bchaHp zk#rkBA1fe&k0<(dHw6o#<84clUaLlKYRlEt)!-%A>GIFd)DkI1TG$^n0~NNglGj2s z#V2S!y@ONp>odUXJA~gp@S{`4lUZQSU@jjC~{2nc#gd$PDXE( zLMMK2#622;YK|LuIvPda_klW#EGuvXJV5Wtod@ML6?DnG8 zA+Bu-41#y)NNA)m>k;8Wk(tLhPK}9JH~gA(<;;m6COUFZV=+wvkov(#_k;tfyqym+ zHoH50#gwe_`0e>B*VQ7~8b__&@hMMaDz`LU3=?JhCI5gRpAOZ=Z}|839}If}odjL` zXF|UEWsch$*KuN5-7U79C%;kpsu9Ua^D>CLvJY1366`iZr-nutDLoQ#^i|jFz4~alL(`tcf??Ay^_IFDh~Kgu1{=sU7oD39Tl=C3zt6WHN1Zg@Fh7+3sx35^ zA#?FJ>KC`oXitsk-_XP)j(M?O^WA6fT1zAeZ2``uFb;4oo8W!rXp(^f>N^i}R&&^$ zwbxMIfJ)3xsd{BDP)2F>8Zdz+mo7F~r>k}R#o8H+83gF8pootyZ_iGum3nKcXlkWC z@eNIgIIQBhSS~&5tpQ&USpN!9?(UDJlDVngv?R;joh%I2n2!DfGL?7hA{i zi)Qm@*!lWMVm6-mZj@#TLk-c0c9&N{Y{St_al4@mZZlU=f8*ErfRfA=Cg?N0-xMUh zT+OF>d>O+jCw#EpfECiWW-@wTJNs9PByefrAtNwVgtYGM)uQHM%%n0iJ#}XfE2Tx+ zE}d2}=MTnWcwC1{@#}+_J=%#G1!oV+4=LG)60)~;KD5EB9`R4T1+DvrQF*l*duyZf zSF-a5SM$X`lkp$HW-KJ z0%Jb7pk>XFaUukp@+?5{a+Lq)biOzy*Xt@{QTuU@Y2+Sr{=i=Dk4BYu8ZbCLJeYB} zJjoS~c$>w-JB=bLQcDLUclF0?YQj-bW^*3*TeArI z+DzcK!bhW&V_um6Y_q~YwZJSUAz1ZYF4hTXA^8c7jf|8^)fl9+0$@}rIX}+&gyM<3 zZczZaL&hjs!Wh7I9bjjX4|ePAWV}*O-_9rsMH#^U>&+VEb2(Rq5;l?3cR!+NrYj|Y zw723g!(ba@#e~$v-`!<&3tH@r1~VOz?x0DkOyY&z)}SE|BKTYzZ(_T{A^qR{H%?CK zh_|9=haCd$Fp&yg&m?4GFQ7F<(!S%_0^iowXA`E-sCI6bYE{_0!?<>mhOv3=K;C)n zr%kqh??_@_-|lX@%6Zqm2B)W|&k_NKvEdXB4mAJ1(}Yp*oF7CNssxjUkI;a5D#oR! 
zM@VMl2C_ewY0@F!JfiDmvo2}vm)5O0Mlg|5RS&UH98Q6#Uf={)+4jW!;_WYKs##yG1NS#XK`!yesd4o6adsy@{>w;iGj(tFt3;UKY$k< zAhQ_~Unt*zK}g7W1bP9Cx|QvW1*T=lkwmAy{-gnPy<%j>R~l_&J}0!UK~l~Lcz;P< zvnLaoAuMgQIF(Mt6HoHrzy~9+waH5(53)Y*Jq#f^q)88_3(2>p9r$z4i;HEz0Vp~K zsNYVJg7IjKkg{;g&9vQv=STTO)zeGeTV8v;9++bRRyJP1Ri5PWEEngFvlV`F<$L_; zzHFd<>Hc_r;Xd*pi_3Y=(HA%w!y+UH!_a*zDcyJu;>4vRf9cgkH!tuP z@eK$4{t+n1(k|A2Ors}NG`UAd7cP+a6Uk9*2lw99O@cLkrY_rlj3DKY^O`oVffbXw z6>qGcezfMf6A`LGfQux6nj){0>Ju9v0*8(hsvdT_5Fn9rdp=f?(_FpYOwtJ+bFI6A z10E}}CbXo|T$dYC^e5eV0~ehif~;(8-69{0vgxLRyC$KFNuwMwqbW?lREsww}gA zKzM%scyY>Oa4o8To%<3Rm0c+;Rd3+cPpU|q^VMK)`A(zRIrDj0i^`+X+E1wep^1p8 zx11y7`Ue*SJ%PF))4Y45wutvx<5$=%)fnUWUK*mGbR%lr-*D*I#4s za8(-zebgz)82^Tn6V9)}#56h%#1lld2K$xCILaq3ah;EZ{!P?pAu8s5Z4)A!y~^YMVPRV9U7^F~l_Nkj#8+M4>${@7X-(i@H7mCo zCmpa(R&=!cWI#sA3`RF!)_ZU=Z|P?^ZhZqBlIu;I^_C#Ou{SqwjXq^E^9uLN?O9&O z!j4DOYjVAvi2zKd^kc00T+aX7t2V(Id&;FL;V zs{8J4OUp6YDY~+v*>85#bQI2vgY()S{FK8d4@Nb*;rN1#EL|l)&; z&9b|;S9Yl54vTfdy74)Kz@je5t*|7>rMn;30g;5?a|2#xd{~BUu4ow;}fD=M9BiPwdwDW!fa|Pwg z|F40x{qq!|)pCLHbg33mJ<5(Jkwb5)aU@%ST6MJ(BP6qX1$=N*OncAGE>LN3tt_Zfy(;{W zKkN>>QdWyqBD6n@vN+<(?@)o@8m#um;s9>D$)d9Jce`*7E2-VnF-N4UeQy31?PMk0 z8kyJQz0Q$oYev!T6zcZx!EX23ouZE5Zx2RfX3^G1{!fSN1w*BB6CdXx*;RJHa@Xjd z%xfs@W{^RuVn`Dk(1RIMePB`1oCnv>_;dZ(xzdZ#YE7p#6?#&kt*C7O%ki_3mz;#9 zgK@ft&CaynsnnGKivKhQ*DD~-juJ;NN`Tso?Qg{lo6dI`V2$!OQgWHkI8R@Xdu!jg zO@ax;=L@pS5hKLx9Ge>5)s7ErNa9-aUCM6VtsWV`Jroid-ZpokX}AeIXcYBQ{a4#^ zk9;2ep1 z5%Nv*v^k)4^?2EfQ~<@}ZPv08_Oc43gu7Q`eh^C$=?Tq!SBW&=W~WhKd&hAHPUWqs z%k|oE(@=@$Ne`QSfI>+ts3emqbf={K-RK_7FFe|p{zggoLH^b2zEdou`aFI7*n|2!<1G*=@r$oNW!+mPcVk|1K#k=K1KcIl{+0HAu57M zcMiws?Lcfxd6J;fJN)Y^(G~ArvI+MSU(QwH7wWOj2sl~B-dX@#Lx8UCGYPWNj_2;s z{vFdN;7BfD%(vlw>3Qj9vc?@a7SUQJC=_DBu(|HgH21tUsU@cZT3E?B5%9JonQ^Is z_Y(+$L6zPNq;^D>2fjaBTWK-tDt1j$tMpzopgdQtu9nl#R~4!<&SA<6d-x{Z6HjGy zK5u7CfQe@e2TH$`(V35+r zQ-?YrVx>{B4vh-e`9K)D+-^>y+elGH>q2 zTMCEH4U{-w#A>2wKpYE5Yq^m73O*S17PJ4<%gnM+`&f_; zPlF<`+3ZoT;f6fq$eheJ6s~Fa2~2M=eSC?0-f=uD8;@=#mC{WbfUM_tcvqy`g#i*z zhz6fm^{rT)?acD>nkvnU%W&z^%-=e(#0+1i%BJA5L!%2>J}iuk zQoi1x_a%?z?ysf+t!^#=1RpVWNN*xx`X{#Hj^duaLJ{-|U>Df!yg-u^D#n*^0G;=9 zr;>SDI{EG2#xtwqv5M=PDsHnDKtaTf>Dt(8;1cZQ~X8CnOn>Gow=NMINGD0&Ow?=E5yj^GpYp@6nR_2QhN-2z26C zd^X0mgknC(X9A8}9CpjqAqyoM2|$pY!<3pGir?W7`B6Exnj+hq(v->PLid+<@u8qh zZ5B5cGcN zZo$FUNQ$A|y^FsOlzvl4vce@RPNu6MieT6mr6VljK4&e`-Q8|7oe6Q>r%g zurcxyav5#h{ z%q-Kky+=Z^*1%oebNvksZ;jyH?dH1>I!|;_ABhNC`0&rQI$7pu;Wvo6ifP_bZ~)cQ zPcKOx{8wSRoW$=vFstR7#!0HIO|0bhNf7Y295%7b7Ss6qr7>2L`)16S}7KjUgEXT+wHfu z+i6)Xr}uF<$53O+OP;2zj7XB+r~TgkehC|Ypyw#QGEa>|X0OS6NO#M@^pWqp64r~HYoOgMN`QAl7fyNZjP9OH49n~IG89xw; z_xG7a`4NFoMdL+OPa-ys^yb9(u}uD)=YzW%?eNla{MY7c+T$9DBdE$I3ncn-qMLn3 zY}i~&uRkG-c}qnmk@-5-4dlKn4~j#azv9FU5#`=Bz@F~M#0qxDOV&h{N2yY=QNo!U z9u1LCZm;(7OFQ^W8y0~`HqG1ldFXO+yJMNl7ow36bWn&|VDZ*b>Xh)1-MQmdR5sJp z&z#m>`QUZ&{F|cPC{24yOmPHGVrDIS0j+CfRwkv`o-#t%%len=UtwMp2U|~sY580qaQ|A%bTFm)SNIsBUe$A-{AwzWWXm{pzX;(T&ScmRcW@2z1bNmZ7_l$-in^ zuRzso+gxE^4Q274V;m<=;+e_2%4OMQk*~q>XJ-VbSEfg1{v=SQgQoqVOw(&BqZt#q zQl(A$bL@VzG+GZd6IH638Fx(`5C4(0-$jnk>TCjDp_}7e8>*I|c$Q+d>p;VT#q1w( zMh8}GK;sj$^2FeEJa8HTmo{4(sXzQs_qZRYwWO8AJc=7}jcGLp+(npmG`nrxc zs0fOG$w5HS45Y}slUip0=*(b(u!7N7ZD=JP1~jac zSWp1t+urS%l${qMpXt=DL@X+U+bR;?r{o{h()c$T+ea=QV34D&Uzt>83$p;pu68%a zz5r`!gTDGH=*v-<_XU?ByH!dm6wCipxM_`gCe@Exs{i)D`gx!)EMoHq6R`oYp_nCy z_jTD-)-uD66{dPuA_F6};Jf01|D$P4@HzD1Db(cixdje9Wfs~bC7GvVAfovg=dQ0C z?>(*gSLyBFMKn-@JB|lm+P73Hp{AK$12fWn?2^C7XSy)BI#WBeY)=Mogefe&m#iIe zKTft6yO@s2P{uT$ohK|ko-1)a3+x6K@$r&{kSllaB!+oW$;a6C?Vg7cG9E6EDL;E& z@MZzYr^5)g{8B$+-Cy?sHXAby-R8A9(+64$fRl>}mexi(sRJ=sYw*_j@$Lan$gfdj 
zx7`QBKst?H3s8O%@`)@KDiA1l=%pK}A7y~!949WK&COvk3vJtehxvSe2FKMn-6$~p zb{+EtZGgV_YM3g!SRl7KBVb29llOat7GpG!w|};zlMC5p#j2oOHN!U4*xUyH8ukWn zg1E`#muoi~0j({Ar}*MxYKE~TZv%szI&b4Z<4#EAe;B|rYbZ_;(hUSbbnK|Cg zy-yra>JVS<4OCtYW0lgg7*>@N@N~RhzcpUoTvgNlHMIlzoS0 z0r1`c1qC%|hLHtYRBwS)*`6w+@E)R#Hfy+m`f~!bqK}UcH!rV0(TIbiV*wX5K!4_T zInO``QB2nuEA=F)J=MH;W-)#5ntHl8Y~_yVXF<% zltM^Is7jKViXzmzxsyA5gwX?g)a^OCEPdQqi8WKe0~3IVSe8t*w9;f5a+uP~glHN2gRg;%mE{&hnKJl0V#S z@!gE9y(O%7hmSdgfuJ#^lIwEX?P9NnMSb^-oD^ikp3!3zV*M4ac&X1xjrKD|ur*qa z)TIE#S$wl4?i9ijW#jcZSr5t3IAk33_Bf^#`~ZZik3g}j{37p`^E=N{?wI418rQ&ubjB)&B<&UlxRG3xrAh8RHzJB;v??I;tG2`D;mSO9sp?Iv*?o+v}XHpm- z!JU`*1W>nI9S;<4FE;c^fxbNFLx?bCG#8mL7|2Y=AppUV2#Xk@3MooLtobe2%u@Vr3?Mk2DgV%Nw{FHjb* zAq2Y)fO;@Ee~7yKN=5&fU4wz%ph(FmbayEm|AL1v#M)vK;F6)*h>uLj$Kta-*XE_{ zcr~Lro~tf!+rD8Dn|Zr-xJiU;eJ_cocGrsy>g&&+ zCHVnFIm~1VX(i>QLT$XAqI%1aQi635K}!;RvKZZdV3%&rX%Ku5#KR}aES>s3`ltT} zP$TevCGSxpW>0ok7S*EQXg`NUflmm`51=p{b9W_U8>XHnT~WI+99R;%Mb{Xo?R&b( z;WLGP5MI-C?Of3{vlrs)gfW-7ol2>!XQuPy{FIdpd|Z;=8J?K<@_{D(Nu44Dqu+Z5 zoP$dEkX0}&sO5wVd7zV#4ZH;J|INWS{0QgZL0JhFr#FpPtT#@=kb9>_1tKn&!xPK`0`H->2GT`zZ z=VaSDTrQhPAa%kn(K5VtTTg~CO{5y()KJ$Z|Eza`r$UAvf+n8{xbCM@pZa5s$0K{e zl4_FkCE8t~==AR((O~|8(~T2tTCDx`n2@VHl6{-|5^6%&4%@+m@pAf2zOPSS2tra@ z9+uB5&HedCB?qbZ(!+Zj*HlYCxYpCQ=x0kn&YcD379crP|Jb24PEXQ`2`k_rViN0p zzKe~EBRLbMulJ*7#{;DtdxR8LaA3r5#}5^s=nkS#SCY!lQV9z<;BYGdg_Sbee2pI+ zQk+Y|Fl+Y(v^1U0J7SA$pE}BE8+_fWF3cWk2QaM{hzI%l!dbBa_KUr{hD>V>1Oi4SPnHAztiDH z2PQTksLzVWBqTJEqYAmPb17E(x>+)_C_GTzsc1MIx!@$jn340ALqn zNp0+Wf{TQ3KgJW={w~fO_nWiXoo`Dpe9IDl^{O}G@s~=I1ODt^=*xo)09Uh4_H01a zw`RPDV4cHUaG^q4RNT;ygi7ON2|r~iLYn%D2kQ35{V;-MD1|iouOg@Gy8)L^;4REo ze#m9xHIwl^RGjjCG<&&?Ou3%#_kA1v{EF`@hTB|brB5%LN2p?0;mr!2r1et>VP-jE!1WL_~w$R&OG+5d5sVVb+}&z4fi_)=rfM-4yP zByEuVV}^^8FlPVvqt55|+I%Pe6iu;OR(*}va>dsU3h`D<_cn(TM-iQNm42h(Og%OhEKon{A#c#S5nmxf#85iXW>bl2cv zqxS`%+LDIS+`wuoY`=D}LA1}v$eCtGY? z-pLLEC;-Z*nMYcqkWo?-xh3RYxP$yIVX1wy}=MbS@;Ia!qQ}06Yxo zA#;rpbk@}dY&qRzAs39BVUoGHT`!3_ZC0t*9>=qVh{~!*!%mpvsHki{jfBEt_Zte` z@kgC$Z>I3Ke_uftrkheu@~~)5KzN&n!8`=tc2N}p12QrvjQZ;JI|%Lu2a`?z5zBz} zAdW5%DX*gc?2AMQoBV=*MqrqqypUP$Bx3NCycl)zcH{ZA>KYa}hcAIJt@mcw%gdl> zdQW(Acgw~bc>4PTevod*Ng$cyZ1}(jUqFGky2bCYL$?+d;Ur1E|B&3I$8N*vd}&mivHbG_*_2tpSI_8u)r>X!sDprm*T8za2Sq^HcZP(a76J6V)Ae>dujgB3tlAJ(4_fs( z^3a`^&TD{F_VGJ{@RJF3gFKYG2rrQ33hS9TGoyGr2rwk2w0HOT5yQD2)*kGN`$o0z zr?;(d30AO<#76EagVwrordrYl$IYO|1}23PN|W4IZ{IlEKCY;EoiG!=*k2H?IbCzG z{ciF;X-BGdoFsI*U(hcje}eHL*nU(qLC?Z|FKfx#AC8!e>W0hv3Vp4BhHh`e3dZV} zA)=z@@jwJ(X`|ot3unsZTuOUKSIVfI@r=@HxtN4EE<>hOO@h_hfiHm%9f4ld*M3rd zqPN+FNz&yJ_~ZDAvXD$FNPU9WmVK zxqhnMk%|VyU}e5sYUL4C1JU>-Iy1Dnu~xda!*jjD$@16PA}IK?V4+0e=-xIpGBYNq z2n1flY^OfcmK>{%x7X+~xM4GFQUrlQr(1L#4oH!(BzvY-o8$B`I5iXjwiy1X4fkhALGB=orLJNI!iyb16G}c2)1OIBT3}OKn-u;ud8o_^i)-4 z#Po&a`yV6<(;Hxw=(Bp#fM9Q+3-Ku8J{rS!2~@?gg(1ar3m!L=^2|F)(q{8baUDj( z=_g0ZlSe~Zaz&iI*@VS2!2)TmLM*~m8;g-MA@{Q1L*fz#B;w0x83cqJASk0f5VI>^ zU;*Hcp}a(GNm1Wcd_4A8%MHlxfWkckTKqKp>**iYQpJjiP_0Xw{jeJ%JyB;XMhm1$ zm<>!lwTni6Cj)rm;! 
z@N0H;7R=A>LYzbX84r-<<+q!F&w$PQtC*bcj{%m-WeYIRz5{4gCXKrrtB?TL# zy$&kMUM0ZF<}d8Y{rr4h5Wm?4J#VoUGI*l^eTLYKcz=BNCo{!vx6I6pF2dF*JWgS% z_<~|}syex+;NN`J;u95vd!-ATa@to!XY)z?LD|zQjBIpp`ehpi^W;EZzAXLqZr(aH zYDV#XcW?hsP8A5jO1;E_0)hSQ?#AQIBTU{4KF2TUw)XIV3*0p&RrJqTh!3{F6-4%w zjb0kVm+!+2#`m&weXsz`mWi&Rd}q=~ukNe<)4%ys0)1bZuS}e*6Fd0r~jg${(%&Kf0Tzx{$8u(r8}xT+rvZx4tiay+FaZmihS;QGk1OZR=k^ z?^-5^ePgG4Pw<+_8Y9Wv29|SSYfH<^9pzZtUpG?}G03nsadGjk`FTt|J-sHU6U@`o zQ#sBZU=-fy{qNi4ytF$?`?s{Dxo~bZ z3H<9YwBJuW{ho)4&SCY8Kh9eD3@Nyi@18+pPBXpB!?@delGe)3FAUG2?oW#K3aE<4 zQc}=;F$Dz_t7~g$lYOp~|NZae4iL)#w*&!?vu~Ug5Q%J=h$@vLhr*N&;|6R3xG1J2&YNLMG+ta0Qye{_k-A^|M%Ed|~NBle=H{1PiCX-nLh+ve_J2}itplE}X0m%ex zV89dQaDmC;pCAyp%=&iCAfYr}vFmh~*qcjgXj%}1V7QwJHs2D%qbR4H?J|_nc;~9> z%|MP1#oVzG2yNWs3UYbMcjU_<%#b<3C3Pl4GV&z85HQ4qGU{yuZH#jVS!}4zF~BI{;lxf;JVo z4KF$&MOB+i_<_<*H8N0EM zIga&JyV>n4V}kDW00|x=N#C%Y zz78>dp?c^<25q%?NM+0X{3}3JO-fO5{5yt-_y^!3h1_=w_<&%H;U7tR0JK2zYaXDF z25UY16c}cFb0LT}xIdU7X#xSfn=l%%)Rv;5D^4O^T-3OuNRZp-aC;t%--$`i8mf5Ftkk0%u9;y@S@u+OxQU{d^arvJyylzF>u}$dPa7ydKX9hVH6ea;E97faaTWsm z+H9r4nl#h!I(7bFGMmCd>kz};-91+-jx2Ioyd_e4#fa?ba`Nl zl);eLEPU}RKE%#wMGF?13?}AQbwV;m8?@~ZH|kpknFB8|U*et>AYu7CAlS?mCYTd=2+MhrU7#5~F=t zMBI?x@nuLy=C$=V>Z|&Z0z*$<<4V#$a8{KcIBri7hcbk3kKf4V`5DS0eHA{x9hy!d z7p7tsm3&$ta0*g0xDjDtdjP*S;$6(LGU_}?tgpi0V5G{ML_V*7W@T2XiTGWaE20v1 z`b7+Dz$v5UzcBFq_-_l@e!eGOi^;~-Q<0S1B07jn;~ynnbr)f5i9#FRROZ{o#lEYc z7ejyvq=DP1%^;abmm|2fR?#U|bs~D&rgrBN5I|tAW;cZkYJY)sju!pmNJy4;zqr`X z7=&<*G)zX(@6mD(d5v)G-~V6|E&_!_?D{GkAZ@&q!Qk))Hp#$eT2!4K7Rvu)Tf^S+ zG4j9dVU1$2t)ZR&t1cokvTrzn%KrJ*JXGAdySqCyI$G^Sg4`4>YymLpBV9^m-F5{6 z;o)L6nWvXm9L+aMnqh4}Fv!`77(5)mM<3={lAgY5-d~Y31AW!@b|y^!?E`EyAS6VE z{kXoyK*fw&mwiJfyfR)Zr(McZQ-;ESqUxjXyXv+idA#{WkHX(njTXN9$S8DaECZ0z zr!i$;n!4wJG78eSj-N$3o*(voVoiWgr-o-|Z-NW@4gX6eArG-p4_T*cZ;>7aG3iz> zG(%C@>{!DMNYmYEUfo$|1;FF!Tn8o023|Iv7VOF`r>d$-oMyv9MA10v1A1P(l@fZ# z`D}R$(2@BPWk9>F5xZ6T1y$F!9II1 zsYt!VlV26GLV*E`e{$gBB>J-diB>dvOAC zX-2DbEyk=xqV=dYct}$$ z;&12JCu~0$7h`#|K zsqXamf4iyub2r%@Hs#?$j7i`#31EkIUi71O-*`0j%zXFZX_eM~yE?3Q?yFY+K^JMh z6@DJ!iHx3$8`?r)HPx1pU=cvROggkf9nS63c8w#tjzb8UvF&p{w9KBcd-L8lXg9L! zi*H>cQ-i*B>s+c|x*V8jzFkvByC4~kB#^ajAz*$M_mOkaQ+wgZTorU(M; zM~f<3=U7ja&0W|14Uhj@%sL71Z4TWzT6rr7bTNt!rfh|yB)zTGXLr_xS58}0jEv)n zqG*}56V-B&(>l7pUeP?fD8(#+h??!un*rb#4N~hey(I>6l!s6wkQ& z{-wj^?M|CmB%#5`BEK^E%M=QWYt257zup%z^+qeZ@h#G)Tbw68(wMX9$xhDpGOO6@ zStku%-T|8|POmY8-o9!a;e8U|-r6h@_YE^9vpE4^L8T@;!{I#QFd%kC6AHL@4b#{Q zU?-xX6K9MJR0n#>mp$~3#eT_=6pDZx{ZkfAPDlofIKm6nyR=>gOp zPF8MM>VLdZKdQ$n-QKcN-I7vuTxAQp#wQOyp|K2Q$ueqS3 zAggxnGP+RAyGYqHO-=6b^;L!z1eyOnHscYnkzDwoGP^k02YLi>K8tFpMm#3G9C3lm0| z65TOxM>$+g!PC``{Uhx^8jkF0mAM=gP-W7?pb27O@Zo(s^-Qt#nv~pyfZC@Ce8<{V z&_)6XccBY$GmIp6VQolKd9G*ZA=Qu@-cLLj%8ihB*ZLa<(l_G*G~JtKQuUXz(6CV9 z#7ZPji)2=wjCq9R9^>1lMSC-w>2FX8F(uw0l+klLz2*M}w_;IX6MY{#RPF>O1)+r{ zuCA`)NbT?JjFTB_qAq}m7`1XOiI0?%hmn+`BB~Pg%89;eO=kvsB;Z1LdI$lMDdPz; z*8hSI46vU#Nczs60Twe5Y+KKMz8td)V7XV4s3z3MixCaPzJkeR46x!8x z?KV`{Io0*GR>onsvVa((XXxBx;r$}pcwR3u%TOX^Meef=)Q32i^PrZ%FJe3hV>cDp z88TQ>?cL~*=xc8c1@!!ak}%4q3qsbIf{aW_hNx-zVf8D#dPC5%cSz%A;;bZ|N6g8w zx~UY2%`}iw`4YUe+%ZH(##iL`^@#?77#V>3VEB21gP=~*JSu3yz`+HSm+yZ(nBMO{ z*X4Z3VN^1YanvC0;PbmG$wX59it6n}>j*Chpi4;q0tF?T-f29D2+gqf_gfJu;Pl5E z^>TakZFeLIu3W3$Xj_A7*INOI%j2Du|L;TgQW3a)-OHSRS0!3;J1-@++$;KxJ0n;B zq9eL*4{8hgl|%@?F_Do2Uug>qR$^W$%PX?0Z`?0b1)l%wxt!Xe`Xmlo!(;CX)fXN% z4MSB&tZ>hHrY4DAc2wP3aeSz++Y`G`Rixzy@J=%%->Va+`Nclk!#3;bP@LYtkSoX? 
zi6KEW^QEBDzl?9rmLQa^t)dVjM=!K)^j>5M)ijJnCI+oh==-%g6Pp~{B~)&ZiaEQU z3;ejY|Ahl=9Nm9L+W(z3Z(?FN{-il(dwMa^n8}BW4KXRM(Cv#0B&?K02nhG49-yNb z$DqymdOgbY{A;0Q5ZCsV6duOXY-$N7S$asLhdI;Aw_XR19!d2L=-b{@aNL)WG$J z7f$@39_-5#YHYeQPzm=Z(52QHa{CHTVn4-SpQX7Sj@9iwHY0RH-I0Wo*9m7$!<1>nEhhaQA1VF5HqRUZ_~iL2^C=vgHz;Y8 z;cQ0t-uZD%LdaEeBEMD_Qj-zpv$rF2^EehN{s;W2+D7vhz&aaOXNeBT_+MW?%eq|w zo7rL%=Zkd&G4^OODIb|YuZNQbB!Jz|d<}2B2ICeE%s_1e+RsfM4~%_i;9hW&_$LuM z(MBC3+c}}h0SV7B4rX#(ujFSEl{mhyVV<0a z-y~V4GE3^q`B6h18_7`F;jH&q%5`IP6>hlwrWAEZZo|tn%eFcV33`&ph|j__GL3Xo ztl}uybW$n;tdr@XUvMc=rg|iY`r)8YH$>E3cvrOFRlsN#Y{Ji|^zi4uVJaW^&HVxr z1O)EY&nr687Z=Eg)xe_#IIHOgjPb z3JvZhizuR-F=wF3A_oUYm3BU48U4-TM*@A&#Ov_#rfGVr!E}4?EZdRy@MzhQe4~4& z#m7^DSfEgL>_Ezy7%D?lQ1c2n@Ow5to`IhSv{)Q3Ef#XxHY%yG9|Q<0Amfa=*cuKS zN@4(@2O(QQ`4)OGI*=x8TT z)c?N^6nw|u-UakX5|MV01dbF-Yk>{Y=6hlH)GVoeh3_R)Fa-Pyy#`AJO$zBV51$E3 zB4(H4c)e+m$Alnje|KnuhhZg)6|4fvu~76mi%}xBYeVXjU>)=OC}Q7o+e3ES5YCjj zR;MPgN$!y z;HPI8@%#GvvKe>5R8VkeDEuVqaRjyykAJsNl8Mn-q}iUBn85fc2yl5zo&Gso(HeF} z>}5MIy_UoOexHh%P(+U{BlMPbBVg7?*&lq#+_RlxW}_4!*q~du$#O@A0p0I?_=1&6 zzfz*CXoLJ|7g>%&nrHyJ(tkp6x|4=3;7OX5OI&`>ug+7Fy`? zNdHXPpr-S^AJIeYm(@VWBta$DSyTJYIVep#6l{5(YuBm}$?QPNE}$}~4sGUeEI&f6 zCH_}rB3>Q>H0-AZpdan4)Ne(;q96huK&4@)5V!4|(YT&kfph@iY%#gm9(i}WHY!8< zufYuaU+%tUcbjHCLy0YWb%XU-PO-T^R->my%J88_GRG@4Rj%34bz=jQYL z6Hb=d78lz@JlTT~@_n)N^rLqD&2eyo{Kc=V*B+O;t+b{Z6bp;5#WNnXV*-hz#Jxh` zg}e#KZM495LJ}Jhd@7ksbbQwYU8KUHh6f8ZuGBZPV5!$0pB95Gha64!_7QUsOwpU4 zm4LOYEkksu*&*06rcf@LqOKlClLeG)q!)1K1t#HM{}dkVfzNIRz0fX_#|Pp~;Cdr* zrT#Oj{86(^d&|JUFgN$je_HNi*})EQtjNL9z&-HIMho^nfksIzwqGIwOXb^6!e-dN zcGUU*{d{8g-S82J@bSadc36zAe~LG`UXuVJ9hfddvCPCniHrf;DEs8zGvx>i{tbxY z&BkZ^FImXIJlA7d3bYgpB$J95>41 z58oSFA3`A*6`HF7caHNK*rf*I0EMeVXzmIPQhN970``|m*F#=&w#?F5X_twb0^ zs-e6xI%8JWnVpC|L|@>F0z>ELBri?FpU@u_AJAmzyZID!gD2V^__?T#z68fD-+i&v zZ+$O7EgybFuk9JlI|Cp8!0SbMhXRZI(syD(_q1zn`R$1N!M zejUOC2OIj(1aIAs?iJw;@_TE2E)Zn=ZUwAy)j(WiKyO?2Q&BamZX9zj2|4N0nbZX} zAH1=lqHG{bFaCrivU>gF%!|s7ZaobqItj}>?pN30?9$Ax)~rs+^UbGCgpDwJ3WqHf zUIV{~E`~dqj4oBM+lpU!&o|*Wlwf{8fAO({VEvz+DuBM+;sXtz5 zXA>@fL)`oKrqlh{WV&zU=6)!t$xA~qL|$(i5qrE{{DawGq6`&6MQJD7eo%MbS%kD? 
z3A*rouP_0j;yVfmo29ljbLGKND?o4sHIy6;~<8UD@q9ASV{!twKBd{5gc#g7Fn z&A&u#DhX-xeB1fc7yaTPJ0jb<)b7wo*pg8%om*+Dphg>@oUE_nGX60v%0a+ew*E0u zzXy1w22xmZrW7o{Q^BT16l2-30b2lp5ft5I^lZ4*{e+Uq?Dh_rIb{b=VPu4?r)ab4 zGq!xs94EzAPuIG&8$V@D%D_#OxdA)TBWwuSOkjH4Ln8NH>Z8kCt;Gwwlny^2O~K}E zDD1!Fe+gf;(oXY!!eMS6V=eJL=DDg_Nu;xFU4C18ji#;qI`7j1B#@WX&@lX$i?eIM z_-zs_eNQ8onP3%En$Kq)>v)pT1&!7!rHYffAlOCKtWxU}9s4YUmZErQ+zdI!a{b9P z?zJVOY0?No3b}u_06Y>>cUMwbjd{OfRIuYija|6Nu>z&b;ztF_FXp`cjU2tVZ^0@d zgZVsw*SAyg##t=gvV?Aj2hJ(Ryk!^CPS) z+l0prBQ2=0)G8cgT;Zt?gWYy>r7;mee4Bhf%u^zmEOANCagG$-m!2=$bVj-68#PX5 zu&_uBMlf12_oGL!HurO_u+bmPhWCXl|2HeuQ+$Jcz0#acf%vgr!o`x-r=`9a;`O}C zS7+_bqg24>`M%+?H4&0H#-rVwaLf)Ge4W7qK8)Fdp4}EKgq8OWz0))={pmUTPB#MJaOtY%<+oIfT^P+Z$jlF)wz; zW{1PLvqrN8Bfb9a%Ygby50gOsfn+RNl|&srw_vL04}j{i^jPmXfqcADpWk+=Nu^AK z?f*YQ`GH}~3arQkdzRccY26P0#Oz$sHS8Fga5SX8ihjD3WXu{!f&kG}PDxm^ zsDFLcQg0tQ*@-=Cqqk-@nC_}w?-!)GVP!g1>qvoou%ZQ?^G1Fmj>#67D9Vt{a4M8D zz3OqS&NVaH2FHZdb>~=MQp2dfmuY^dB-p1OF%8l*i+H*KZ4E$Y)6E`!>OjOgY)`@#M9t8nAXORfL6Wtk> zUIQk3yR`Ko55AyWni%r^VR9L268_zNCM3kk`Am^p%ZKU-~US$QqwU z&&U=9v%fTU&FdeWGW7yodil6~YWKb89sgf8wK{5*JXq3Phet_`)p%nP}%jHt@^Q&z}ozPXIBN+ns$y(y~mY=Z%87@p|8T-J| zjfYE7Nsw=SG?5x3N|p}|b~9M|i}fK_x$i|*1e)u{SZzgxI$jPUl5 z_%Y)ulVAI`?#o}_#>d;&q!4?_(7K%{I4dsZL!zK4uBx)w?roak_u?+UO_)?wJ zC;{%iwx~7n2Ls6D6(l}0c$}qy`n|a`Xm?@M@Ye)D(Qa^qtPZO(IdM9yaT+BK5EZo* zK!X;a=p>D7_>&vl*0mh|zH*oO?J<`g91g>EdrK9D{f^5afQqW}UbBcoKoFCNU4)O;%Yw9>0Gqv3UcYXr>j|>Vh#3%p;xRVtrU|r>D zvP(Xo@sH+n-6x~bAJFwvivIdZ+VT7Ec5ipvg}T9l9!FpoM2*{#t`&V?!SO;p!%!k! z&=OX_h9r+l2qJdic)5lIQ6v(lj?3TAuivuTFAKO1uQ%ktymq80+BfcIklj7&e12s4 zRQUe~1p*)up9Pxl2k((43(gZUGEp*E5mG7*etXYM(`)qffV!neyJn13e<`ouJod-n zad{GaQe_PkqO!O;Oy&DX>9jMDtKlA(Sb06Z*5s???yCkE2e2s98XhS1Rho=mW95sy z;m?KpY7QDay^2FlW{ap#!sf2hMP+H(#aV|10Ouc?A*BAsOr30D@# zYlr&{A6M?KK{rYD9H%wfbsWJP5l&E7`T`fzzOb#u`i(G5P8dL}22e)5G(JlmmP47X zt_PYxloy2L1jy+cZH>)F4=(3w-!3YJJ^?wOs)@nXY#Kc^WY4OFvapV(zw;!J!bNaY z=+7n4|BTVfieo!6+L})k1In4i8HSadm;}-{nM7j`-T<2YU$gmYrvw=%CvqzqN!Rdf z6`P5X|BZ^6VVwF1UR47OdUnLQ-|3MG8Q?z zmIf#|LwbA3^@mZG%!c9lj?_gc3ax6ZfCP^kH7l0{Fz{zUTa>-H{D7(6<7%CX1UTEN zk>rJr?Jam5X6U1O4}@ zMt!S!w;oxwKVW8BVW!Stu{mNH^a;rqy_~WpmwRJi`%n&a0%}D`aB)RK66toFHnfrZudzyr)pA*~B=l0w@kx>VOPPO1O6a#w#d=|;w za|U?6$ALKtnt`dI^6l*L{B|tz-x&Jgf5y;lDG^BlgXe3UK81|e{$8i;-W#OSYsb-w zlv1ZBS^V#mN+C5SmNQ>ThLRX_Ph$O>~T3-r2Ggk8v|VVH)ZO5Unc6$&jIcC2sSA~ijqf=J?!)1_%?z*){a zzXC4zCc)VHHXlbab@HR@q+_o)=baCQy^y`pNaF$t>$0cZyE|tplKVc*HM}(#|64&P zYaXAU!C}=#r}te+5FNdfXI-oMSyuSwTimzU=H$5Qw_3Mdi~W~9XO`gT3p{C}*4Vq@ zwn{7?tWBhx{Y!7$3X6ieASBhSW@anw-=vKAl!N<#d&8{$Pn-73^9BEVzP}2@h$4G@ z0S90#X9OA%?^mqX`cPBypXAIp&6F>S0q*A~4xm5(u&f|wja*=y{t|V zy2En)DhRYY|FZo0(DLgm!lUe@UOpTer4v`=<_?m?o$jJ`_n&DEKHQ#N;l@vL;d>Cy zYG^#R=$kZ-0;vuGgPWwhL=JRO`Uu;~7L5K{zsJy+uIA3?WFP#493KONxh9`*$`op) zPe{)^Crc8=LXUMboRzvYc~}z%`|I)uJJ~P2KCknW0Pt@-IM0`Gs#0c>yhL45ke3n$ zjYO~ys4{0~zbzn>_4(4<_ z`xIOVS*5p#knO5#YkP7#8^G8=O_v_W=OekE{qnIAgq>^Q~SG8TPe;@Um-T=wU zor~1eR)W`UjnxG9v3Tg-^tHuaPtVy{Z9*IB%Y%NAb`gRcJ=b~RBtA@)Y9O6akxik{ z6wt%mHUP=p@Pe}LEj4qVxtKHXve{ZKE;tIx8h1~EJQY#=yZt0Ep=F5th%?(^`V z{N0I4#%ql<;?r`QDekC2+gCQn4}*d4rFC>1Xlw&gy5e`4pZ}y{@QDsKu=l4e5pT!2rme!T5jvr zj$rL{y{;@}cKrK?zfNjQk(f2~@LH{>mTvfSw%~ZKx$~0uDf;cjHrtl+;RhrboMWUP z(_wM+OwQ(YV$)zZ+U!M5+2i8LSb{FFiNmciUNjIQR2CBo+dCH@7%3fneI-jxBG*jw;{Ik=+(XfM$ z2UVT22UrbC(izcm=aK+Cad9I2KN7GFacg#+0g+uf~g4=`FHHuGZ@Pl(Iy4kb^yb+k(WpDm?wS5oUpVlLdYKW6N_b zHjM>>Oy`!rGT0Fcq4L*M=ROM$61-ljjqozK@yhq@{)ST(1}JfHDv)|cu!Zj7&!8Emq2)#3L~$9=8g)qBj0v#U1(l5pywws{~x z53z~qKFfq()5@U$UFti}^-jFeQTssdt;_JRZ4Rd;C1>^8Ul_hSG~p-w6w$A8m!E;v 
zB(p=<7AL_b)5as6#j;rg8qCLAh$*Q?K4q6>9_^x)62Eg{Gk2;&#M5fik^S%70QJfp z(>5%?-+d1MHx&ICf^G=;-WdTqXT7{}aR00q^{MyNq+0ZS!9Tmbuy8qAzy?qcrPG!y zzIE*-2l-T1(K}Ei5vL_Ow?#69 z07)lEG{64^%urDi48}L=?Ex*4v!BUR)8oP!7W1f}uLO z&5ZEcG(%Lm6Gs@=V&zG^*2U zq5kgs65oMKL9O_6LakW6g|Bvl1s)-4lFM4oS;NXQ_#Ft}*I^MyugwWXZnm=f-DF$? zrI=6$78DeEN2lNO+zt&VM}fPmsaBUKBUxbEKfkIr?<_1>6^ma{T$+Ob?((%6Z3TyB1vw7GX$FoX5 zd&vP7+ke>dZ=B^MxmSJ*2VWQ6_R$&*e7E$e76Ga};b`Wf3t&oqh`ij7=pwhrVmknx z(oCz82Q!q+by)H#4lFTVuEPN+o5lR#^NE4`l3BKaXHAL#;s&*hvO8XHsg6G8p! zZ;sxzPxAUbcnsK3tmGDO!b*Sm-@g7o=uo96_z@#f2gGcjM27V_;5w7nG^NjCYuYJ=hD=Khgyo`kPWXLl^VOrOOv zRVVAzto+Dg5TW4TOZ^Qq_ze2mcLQU=m%>R;8Vu#~KYtSAD^bCS%jEFp-lT~#&qZU* zYY4eIpV9ZOb{WwvZ257P(1H;7Fgaajy+dW{v5PoUy= zf{yw07yt+A*BrGnI?5(#D7pAatwFNjE|-L66uhF;F@aYrtHNBh|7<%^8BYNYk6CtQ z!-J~~wxRs8(ai+E*=K&J(yD#vJ5ehx+T>&CeLfo>qq2{Bd9bNIZOB^lOHj@gALM zq(WaE*1A-uYHaD&C8c7Qb!yFi0u&zApLIYIC(A?0LWc;fEVA)3jm@1WK2z%+` zmWzt1?+GPjWb8#xho?Rs;wpgpME)8QONW99INaZ@&-4GG2r+-^wfK3ciKo00QWqEC z>HX#mm__kcB_Vt~^jn{UX2ni-th!0RLYt=;tRg>}XPwqN^FvndKB$*4SJJHs_>vPv zJogpU5Xs6D4F~-EkK}XrMa#_h`S15MgC+-2+1O%xlhz`Ij2C7T;nS2KcczEn0yzND zxbfoujtDn3%^U0hFx(%h@Q}$5+6Uav;V2&(trWmT3JLGpTV3Mo>?D|BP(MX670!Saf-;g7J}X)_zQwI`TD{EQ8~3f7EQChtC}gdj*dzFJr9HlSJ$cWfOb0 zJIN6=iQS2!H;XOaJch;4%Ex0kr!c2%TmAZrWXQg*k*%)AEm^T$!D`dROBZ<4T|9bD$aw={tRTqjCYCEIF1E=kdzL1`aU?s5sa8c%Oq zmxi{J-}hJ2#lV@0t>PYZL_|Z&TxdlC2bk>my)9sa;g;~Dvy4B(_^vkZ&d&#iSu-S| zc<8RsHsR&N;4ZoNu{GO}_$A7vIVVSSAR*z^^nq@ynAbiObK#94O&t!=I1$doc=)%7) z-pr!v-R_VwW^%*+OoUfN&pBwYmB_Nzh#TUd3NG&zlTCELdw5?M_w5Xq{xV^)-jZ43 zDx$&t@?A1i6+1YAF5-)i#?Jj~iAhA0`@@wKR$xfj<5S6MZ{a4VV)NW|po{Q=myyX7 z@)sJ_+kI6mvqhJMOJ_(|X)rIU2wrr5sCjoyA|!V3NpSqD8`Bq3ZY(j}mc`Gai26e* zbWyNsG#i`E7kKjN6GgP^xm^LmJPCSXjg(R#|IQi4OqM-OEvh3MdoNm}A=N+(d6paw zHLqBams3A!jUDK#*pTUeo0W{z~8uYfQ6RT%qwe_ zxx1?zhP2Rl0OyRO}k6pVa;eeGr|`?qs;DV;Rf9m z9*)<2p?LDd2=#2$uo!lBdezt96n7bidxZ8Q39bcmt8{z1BAEifcY~f6rlGu^Fl?ed zMhQis5~X$3Plc?ukvK6?tT?DWD;&v1(yyp6`w__25P?Hr1XK6IT~-)Pj!Ek+5ea%> zwYnX30a7)}crYdQ#J`#kCj(BdGR?PulH&hXILUwhEFCdpv&*_? z+)RGV@p0g@#>Ea9F(~vRrQ$QcjIfL4yq*&s*t#N|b-vD*2B$)=Q2n>o@3{oYsDgp* z(p0IA+2@i!&o=2k^fpy0g@{&fVpL}el!4y>Nwy=a%mq^=;$E)$gG zy*E|Fb?15zc*gP3c5UHs-JTLZSH|*7|1KpJRB!Bdmh@UkXxx9WH$jd?qq~}mG58s29 z)#EY;`IV0a=!VTYYRZDt4bGx7Q!}Y|DZL z9{>)_#%oQ1lQ#Y~U?&kRmx;;*!(JOd-wdqqXM`d7=g(!-Bd@)7&Rxtu>nuzu;X zoD>jpS`S|CJ|wf-IMn(6^^MizKEr(B$m%-5{$)q*%VXJ6+-?ElnNU zy`ENuA|CuA?D1eCNaIj#n`lZL5u z;@;UU$c5vEJOk1l8cOQ7PTMOdAky=EEJ^UxZ!8N~5=D0@(vXcAqg)<(fclM^M|&>s8S}H*p3{rqnPK3B-ht z=R`8so{Q;8j?5BR*QD0qija?v)Dd^g8tv3xamYc?D-v#F_ULQ@{2g&3tb7|y5PpaA z^K1EZE~(s+IU3=sBb1{LQn1&7UxiyPSa|vQa;-OtsJ-!Gg_THb4VsZ(puPTO-;HfD z5~stJTDIc{3qEK#viNxSTX=6@jg4AHVzsXJ2W3duP1m&mRrQxiarUkSl224#T>uzA zx)K?+ggt~*DjTYflR7a$tRxIEdXPkU(^HyhsUuvR!62Z|G~pjY>hsoQb?;L^OYf0N zNo*FDt@GhWcijfa7GQkt8KwM;mhHmr=q(PfswH|_tYM@u>JleUrxgOWY#3ZW8j-cF! 
zqs7K1HlR@tdrA4vx_C_yJNFNp(h78W4+< zRDpV=ScwtZeTA&kvY&FisjQT6`4*$e;q*(55Osc-{Pg$}IGx32nx-R8 zEqA+`Mt>vny1V-D^T*~ef?nH+qCCtQ611+83@`NQGn%j4ZN^rz%32Gr0bHhQWDp~y zc3%40Yb}l~^*h6>>qKeoZ;r*TzYi-br8hpwL`^X49h!O6vX!dTIr2wvZwI@xLoT1q*0uc)n`mCtI`%M|o{9m%%3A2n;}{=m zz7k?`vCvUP??u3yC@Cq)EZXl+m7o(;wczA_*Q&Erp^IAvg523Y{=(DO-?e3twgH{_ z+?*QAB)lYzPZIQa1s~V8w+pHkNWUh}Rf%BFnk`X-OJg^m^dveyb$+3oZ?9ASTOcPQ z2J7RU`z9+x#Ng5X4kLN2274$F~emtdEQODAu!G zH^$=r@~ig~os{n5qKe040RbTTDNwov^HbDoibR@Y{x)BRr9Gx*L|4Mqp)-o6`&IG< zmyH;3wNsD|BkW^_En zp4`q9chcq4Vm>ZOPlA~Wu100lbB`ItA19SA{*{Xh5b=Y=L5WVY`DgDvKln!nBvGD5 z%ReV!xv)zN$Uz}N{M5DxKfb%G2&YdGjU29nt&rTaPvi!vrtt`J2SaWa&D#ah@h`DO z8ymor*IYzxrD}<;5-=fsM!UMgFnChy`ssGoXh`xW()MMt`&fZkd#wK_3qZB42BaD1 zqYoI?e6F*#MuF}>^kq~uzM;FsY3WI#*NB&%1`NIjS?ZsFZA6yiUjrayhJYU27}+!@ zld_@(XeVVM^~@ay(`7VNGQ;v!qE5y@bR2LeJ$t{rqW?&V$;jXQ7o(#%XVjDa>1Z)m zYbP3Ge(}2+91)-28wwf~^?Z^-+rfBkPE-0ByZH#%VXfUY(MDz_LY^*GFy}@JS4M-; z#E`~WS(MlD+A%ZLPUMMtY!F!13C7qXwfc);@~`x8K*}NREW1Ec7pD=o)cLzbcjAtd zYBBP}aK$-I9ikt>>nDzI+e09ma)WY|&XcgoX?B3|WWU@IWFQ{4$7BW0FkRrZq9nG+ zy=`+D;fqjTA+w3NU#jz=L);uDbkCkb>*|cv^pxi;&zy`cs)W);NV<5SgCAEYhE9mP z+DXaz20j?4xur#RH;b6U2=aa&$KAnRsCyqLL1^fQ7utJYNAr+!<(LrY=*<$IgfR7y zQQVdr)m2J;mF6ZZ2Y08hLfufi)uLy^9hCZ{Mb;hynk~2vNO;RyREWgxrF(D`#7_?f z9x9l`=VeU$^0@)Hn97+{b?5B1_M^~7;Ic56t&f?#7)G~OeEtV~MgV=2cY0^o`84o5cD(B}oxQ-Z zEwwKCnuzbqyLxL36<_?v(1WLsDXB6zUN+Vz+b94^$9{+Udn}BZfq=uMnz9PiIkw8S z;dp*He={v9EZpWxI9a4f6<2;f3Oh)Ejwc=0(~dD9bXgSa0f;i<25w7CCF2ge)xdGB zL+&Zjix|fIFFk!42k9c+>CY+i*-+tFDrf3nsh#M}wC>U+>k@I~`g}>}Lp zsj|!JRAx^J+hKVfdrqVi=&>v8g)0&;I(ivAop$zl9+;1|_BS*^;1N!n+G zTs(9Pd4Mb_-(L_Sd%ShUf?Eq

GFL;y8=MmI}t;6LetXJ^xz}Q7OY4;ln@rjHlk1 z7G{dQ1!rjL$igP;a}LP!Z(mB~%e2kC-eJGg1Ok`nuXOm> z%Ge9<^JVk7B5IVX=h9pXV~H>_F#$qWDylQNH1@*89;Ld-XR{Sm_5Qa*H%CVmc>1vc zc`N|-{1$C8Y2*K;{Z`@LtP`wdRI$*j}~Bpz9*5(mICO;5?YZj>!>HP_{peNV?emdh_h3v3Es5)rJY`1_x?s%BIi; zP;nHm4GjJUU}0eaU|V9T7ZY9c_g|7ePgM@aPY)VmYd>cIJF2@f#3_5mo|=;VvcU)hVViDwJ3{jKOzCM zmKNco&tN$|5Tx^T#0to0<tMJ$RnyDb~|}({4bv zrG;+1=zDm4aVq~bk=qlvGeR$>kxVF*xIlA`Sr}`g078yxfYiHokgIER`BBL}BpI71 zmM0&@1l$b*osu+====poSLn%*Hp>g^d7+0=-Pfm-pev(Vtp|6_+p#$$RlS+8Lriu4 zSlx9qQwi_uO)T2j$bf2Wj1Ys^U1;Wc$r%(6qr3hhtaIZ)zBVYOiS7L!)AhbWWyAk! zASdm#ptC~e>>f+;y24nCnfx49A~S4UXp(Pr6~2s7$LRWl%-q&h;Ez5ms`NKmVpaO0 zJtKtBtJ`4JWUBnFTdZ;&kM3E<8L*~CG{ZXLf>~dJq?s<)RM9!I(l!T+-A_r4c z%WXPSq05Ta8-b;Hpem_OEa*xgr&*Om0`Eelzqee#;JYzn_n^Cr`%9-5nXvIO>Bm^t z-f!hfb%aa22Y%)ZR-XRAJvEgM^+A{on-3sa{2xbquY=3m?oUBLI5Ff@hb*KOPSvw0 zk!uZreCL|&g}G1WN>nLy28Qem8wv2~UQy7cG#Y|y4wYFAeASgcmFk)%J=_Nlc0PE#Uhb7J${<$OlJR0K=2D9roS*{7kTu0hXV~); z0<^^q2deu08FJXa$@n?-b!Yb|17Piz6wS$cu_QQ&0!4!2*u}+#6Ikx{PL!b0>i0XF zneu>&3TfJ0-)0~^3Gm^;0TIo={>$;*@jW>iJgFT!nFp`GO|6h_Ij=TRE*^%e-RUO? zjQ`~DjLd^TRw_X5JpbS4pzkXXG*qIqBUKn44fEdJ=-?yxQizgDpDeVYP+HO@H(p$b7Y8AKf zpf}^jLh-1B5^)E$2QFuVeh-)U$3i}kiC!MZwezheo`cQDfoO@&UXEfpgj+LLnX`+v zMfhI}ASmmfrUe+ek)#R4?)0V~(FH_eD+c1HSy>3SQyOy?ENI07_0=~bWXvnGOyBi> z9-7!{wdW;GkI3jO4kV8bqzu06z{Mepz<}+;W+2oq9(dR45@M87Lz~y(C+Z?X#`s6i zcxyWv4`t=U{E0zkC#4g*xJvsXk=`d>18`;C0BjA=`p|%U$%6&`{KwOP zx8r%nzDaI`4R-xYRB{>(g!$xL`WwWHX{@r^geyY>mpUgBUD+SorBn{nW3_d$N{TypjVGsp%E947iM?0kc#sUq-_K>0n3_LA8# zGJ)Zna&OI4pKD99zdz}zYotlz4!LU&TIrqB<4%swt{#)cK*oQ9lOu$4{Zq(o-l;~w zPi69UW&Ss)#{PXyVQmESfTwP2S-r>!9>E1cHmW|;}5^*3CJ@wLzb=NJ;fz?hgAfThh8_V1w7 z%2l>1NC2|Vz5yuCWc78 z)P!Z6Hgm5i;I4Qg_~-8`pPb)reEih~H_19vI909-+kJl>lu@>yN!D_6&tePvPK@4o z(+jYjij_Bmdk%WTGG@lr=(`767Sr=vqd z)I<D z+H^f*3L5*f+dTr{Vsci8XYG5udw=xKr$PS`oem|b>BczJot&zC$>J6a7<&?I$u}*- zX_9h6`|}P}QdwE~4WT~6P%>lqEl`nWaCn_KV!6@OdxrK;tiqo4SCUAn5HTpCd%9kik9Ow?vi!gi4t1 zO%}$7){d*r)Ca2sfKQVOFbbkJf*4CpxZ&>H$1i$P56ii;FEWm`vzz7KK7fGp>)}9fPf6DrDo={Q?@XH6Rzm!^MIjwAK?Tx3_&i z>J{!`{w}D)9RG=ml@Z65Kf3$!*(Pih(RF=u@rMwU&mU5=JVR3Q)KRA0Z}HE4%KAq3 zMd?Gw*jI)Xeh#?vnI0c5(DX%AE<>O<*gP1Dii+`}j=-r=skhch%tdlCl~Vhr8$J$; zC36GmIjhufRW4Q})+j>Q7ne4ywiwG+HE9Dl22%i1pwG{jpmYLOngQ{5pG_vqV`ooi z>VWV_E^;#ue$4(*(*cqf6$mdpbYNtqzc;?M?+ovtcVU6xgH{zpQPvDc7OldD%f9fM zP=9}0qeM?frco2ZY1raMjA=14A=CGx`www9+@<(mS!}92MLh-t9d?U|ISmc7V1qI8 zZinz?O^%OoDQn7Gvta|1B^NmBy!yq~AJ{E^K~aWry0Z#qVde!a&Q!z6QvTw?93X?& z7?PMMs1!3>U7ANBZR6-9Tpncfzq1Ee<8OJUi<|q{T%Kw@B_X}ilvN3TV$1>pFR{%| zE@I-E{(|4+_~xmAtC*~x#-hQF6^SNv58-)xPaBc`STEkFy~Q;GuHe1f*Em;BK@1YF z{rTClF;YTgOfB#FD;|`!c?%$&TYiv08zuGO`=mh`_%~kERF52q0&b3AnV9X-flaHR zaULzBjG|X@eTwj24)D1wqeEyF_ccjOdJzp_c0&ba-w985J_e8C<2Jg*U5+|0W;Z#{ zi}Jz}+}4-V@kiae;PJqXx*qgr9XOF;T=-UK!Lk|p4T$rN=%e&IVPiDi&NR~GG531& zCu8gu@;z>xd^#<`gxC2lAZnn2hSl>4uBXnj!F(kCQQdq+7J|ew)%QhEz}Mq_^>@fSZ6bf z3Mf*G2QYhQQm)w&AK+h) zr11t!zOVm3w%$4{%COrWHt6oop*y6zhHj)g1<65DQgY~&k_HI@>Fyj5rKLkky1Tyz z&-u=|-s}AS7q6LT?q}b7t-bbI4_NUD`oXYh*ibD(BGXCH*ndrifDq}9L|RQpP;9@U z+5kHs^J1uReR01%KN#c6FC#0vy1x>zUMYI#vQA)^npiG*4;s|R{=HBqDf+fd?`4Bb z1_~>j9qXwclU-dD=zIeb*5dDELbyt^F!0FlETq#Y9r!9l&R~O%#TXDP{5d{vm zHP&=)^ZvFu`7c7eCgT@>o_DyTUSV5-G(ZUbjHHj1TaO zq^dyVbCp`%Z}2*!!%@P3TRV97AiteSTs~IYfIHD-sBwO-q9^z@Fvf=fPE;sp>|OCi zx=D{%;ir9IOnv%A(A*HG6j=yDmba$^y4|GF`^yxPY|lrZ=KWuWIMkBuu#OK^d{o%J zJe#v8hMb5RgsL@dVXq6GjOPhqJk?XgAKJ)ykqGs==X*6O4ql@HqfXYMD$3h_LZf9!V2isR8DUpEkpTj!()!Ow1rF`}pQHAx<&xYQIGIdFDimgRG68iBytgv_ zCroP+@L)V_>!D?zBXoF0&&i@++n&pYNqmkIg?2bXBX@U_hwVpgPyC_hL(QYra{DN) zVcEal@_2Ac)w>#jG%hDM5saE-^)EA%DSibX53*4Y)?_5K?UANA3t$QjE@{td%bfdT 
zEOgv+fT?XRg3`RMqc?_)=5`MM!>041WE$EpBtgw!3cb-gOF6_2!E9i#Os>9l)%yGB ze=?~vFiNWR`m@SE<9bjYU#eUo=Qr%gTv_m^viKUt5@H+9A4&kH*80^Y+A}HWJ~8-7 zyMTaIh;b7PWPZ2Cy|&NVPSMgm6yv$F`z>mq*IuR$nV?pGMV73yn^cY@iHJ)?`c!X_ zlggYd(-;{4r|x20t|11;&dHkhqREtz!I&ZEdoUFJTAs`3KeeJSylrf5(_N!Y{I5p?BDzq(gjIC!^L#|;liIQO2t%hJ)m>tcLkrd96 zoo|k0w|Q$B@MzsHrW}wwVh%HU>vgbH%OK_J{%Q4tQP5zDJ+A9`jZO6K#4KoU7kn|P zr7&jpPzha@<7ZmU7O-cOyuK>Zz=GZrz)SZEw#7MS-HkCnvMGvQ!b}9b(THRNp~5RK zR^WrjMW3mD;VfUaj!=H1K?|%g4-jv~3|b9?&!3{FVVk2#S3^TMYw;5<;&e=}C+w=i zih*96BMk#$t3aT7AjdXEq8BHdqp@neAX=}5B3+I~ z(X=Y6id%ejmnbOKXKTAU)ZqwBZ0H5TLTy6NUY!Mp;B4wztIWNh=>RINg;1fANhHqe z_LBWe&F0uNclfoXYJEmyw?pw^B z*+s`xn%@$oFbgO>^pI8^KB=|fMxMN51k8V~Bd(P{JNsq60w)7yWTK_?Q==?B1Lo>Pau(;mIpaFErk;gxf>m5YT?mdUbZ^Lc^SW0S4`k5l zYP4%4h1<0~f+OUrwJ9GXbWVt&g=DyNpz#vm$sJ!mYI_HOoY6h!$7JcXLBnBeywV;Y z;(Y>oWqp1bf5}sKIVs}FFj7$RF(ykjniM=z$cRn&gH1M)rG>{#(q#Qlw393)WvPzb z5)j2iM?ZItCl1x*>G%tJW*{)gL>x8M{PR)y5ol~R?lp%43q&45rYoBSFr$q~ZHxGe zaiLL)9+PtHJf!?HhS?bH&OqpiL(v3sG^+Ddm-jWCnl|o2mYX_Nb+r0DNPn8VA@OXg zp!^cjMt{>{n>A@Tu<3Y`E*U|eR5 z3*sQR8L|l#0I27D&8CjwJ%)NynbCM^^#w@;buJR(v-w2R;Ix$xDqUp6@)erHvZ>8w z3QQKiO7vL2LdkoeYbF*QPC-pW{XjW(Lfpzkf^VAQD=mIhLVX%fJr`GbPgIiKjHQ@5 zp-sE?Y!o#Ng-IRxI`Bf}x}n*vou^*+_9H8mA6nWd@mk=e)I9X=EV50mamM$ZcaKiK zk3aRMfkvNly(JpCNjh)Ss~>g4!S3ys1iw!Gpss-sk92bc-}m+dU*>VFD;7MX3zr4b zpcBg_aDd$y>(Vdi9B80^Lovq12&JKjIJzC;hRAcyyU9~*(DTRw|JuCGA;w*3SUvv^ zxk(aNnIJl9{@rB3$P!=w95$0s*&yoquB5{XoN1!0a*ohLbML%uYdQ5z7}j&JY%3p{ z+y;yh{aOacrRKbpxsfxi1knejs!_HPXija%iN4mhGm@xv@pfg{$4eu+rh*5u(={?h zBMr!MKDWI&p9;*s=oY`$*XV@z$O3QqICG6+=(fX#!+LLGyUPGLjl*ocB|cVXB*lwr z%l9&)~4v~^=9_->&ecM9)6bIOD8{W zl`!%$2QtrT%+;1U^Q`V_)SOph3=jbP$Bc7)6+HeD{UO&hf)my1`5&L}#2mMD`8tGV zX4sL5k%b-;D3;DI)JR>GFp{`@p-7;zz0`(WSMjK8%yEq?#ROZQa7GF7D;SohqzMQ~ zT>LP}ej(vsYM_26EGqt&y;1bfm=2$VU`q314?8omWHJu}E=Z&m&k3y8uSk6(a&e`_gcc1L#1sKkN5^yP~1m6142hs7y4_BYjO|YF)ddhKWfnzp_edVFJ=oPQJVLh@Z63Q`0_I_+lan9e;@{*X7^|2S0Pk+ zPmBFT%f(Mz1BY*nzaw608Vq>vQ(pAAux`A&ZuP@#8k^36uCI6FxH=kZKYB-FwEW^9 z5#ci?)c1yd%>Ux(xJ3|-LLLOz0v>&u;u94fHn6FJbhIxO?GPSvTSe@s$50Qp26MZ( z4y3#Q)|yrqw0R14Q&D$cu7Byu6@mv36y=^GtoE#BsFuGU>eBI>&IOwW*jJ~;ztn9T zRLh(owBmNy+Z{T&rV^~jgkj9i{r7(K3jneIPgmEAF@Qeb`t0;~G|PhRgkdx{#&0o< z^=sYvk#!{{7n~>KZ;nR%9@H6BeHUdKa-nx6Bvl^kV)t4 zuMi6|8^wVYqmmsst|cV$91XB*7+PkzhIe+(`lt@oimy*rb;dsv=OqT{S=xB2c+>O_ zdTIvya(-hz0#;c_o&eHFRb4XwpVj6StII?`)qx#dbHR~w!-YAsv(9WO6>|vE0G<;6dc(7#hrmq<=VJ^`$6gO(k!obv zR{*+s;;_?{7U25ot2tXPy7dNZEd%bveetQE_35o>hc{){4(+&qfYo(r!~04$nGetV z;-3H)Mf!eVRnordc|BU_K)z{r-?n4_D^JIix`9^1N`a0&KlWbvD&!b-(5%2*!3bcF z@GP*oB7I#-@d0q8te>^9>{^f6%6GssixPo#i$T1ZO!e1SPy4qjGxfS9oS9*m2D}2& z1h{pyZ}m0c?Lx*$M>^bAFMDMy$f5AB?De&?8-7T2zqAD&)vbWj=U|jzcr(BmxE5g<0?^lr8y8z|oN#8};va<#mCs@7N5SoSF$-JA4Xw?0}K1^sFf8*)0=5IVffOiXeCn zdfUsePv_(5NRbUQTopn+gg_UOC%@kPfax2m)2*vcw&Qj@-cd`0$_uF&q|y&JA8+ySNJF++Z}d4${CaX?NTn xgs#D7;i$79!8RIN&~O+{TadgqXmeuYf98hH?J8V+Y?%x|;OXk;vd$@?2>|cxadH3v literal 0 HcmV?d00001 diff --git a/publication/chimp128_compression_ratio.csv b/publication/chimp128_compression_ratio.csv new file mode 100644 index 0000000..b2753ce --- /dev/null +++ b/publication/chimp128_compression_ratio.csv @@ -0,0 +1,31 @@ +dataset,size,vectors_count +Air-Pressure,19.24,134493 +Arade/4,48.95,9657 +Basel-Temp,31.14,120 +Basel-Wind,38.35,120 +Bird-Mig,26.43,17 +Btc-Price,44.96,2 +Blockchain,53.19,225 +City-Temp,22.96,2837 +CMS/1,28.14,18140 +CMS/9,25.65,18140 +CMS/25,57.20,18140 +Dew-Temp,32.63,5287 +Bio-Temp,18.86,371892 +Food-prices,24.64,2002 +Gov/10,34.15,137816 +Gov/26,9.26,137816 +Gov/30,12.92,137816 +Gov/31,10.43,137816 +Gov/40,9.41,137816 
+Medicare/1,32.32,9070
+Medicare/9,26.02,9070
+PM10-dust,13.66,216
+NYC/29,28.71,17037
+POI-lat,57.49,414
+POI-lon,63.19,414
+SD-bench,18.78,8
+Stocks-DE,13.63,42544
+Stocks-UK,16.76,57915
+Stocks-USA,12.19,275465
+Wind-dir,27.80,194237
diff --git a/publication/chimp_compression_ratio.csv b/publication/chimp_compression_ratio.csv
new file mode 100644
index 0000000..4326d7f
--- /dev/null
+++ b/publication/chimp_compression_ratio.csv
@@ -0,0 +1,31 @@
+dataset,size,vectors_count
+Air-Pressure,22.95,134493
+Arade/4,55.56,9657
+Basel-Temp,54.12,120
+Basel-Wind,54.73,120
+Bird-Mig,42.06,17
+Btc-Price,48.07,2
+Blockchain,58.29,225
+City-Temp,46.25,2837
+CMS/1,34.77,18140
+CMS/9,18.71,18140
+CMS/25,59.53,18140
+Dew-Temp,51.81,5287
+Bio-Temp,46.34,371892
+Food-prices,27.98,2002
+Gov/10,45.75,137816
+Gov/26,2.34,137816
+Gov/30,8.86,137816
+Gov/31,5.00,137816
+Gov/40,2.63,137816
+Medicare/1,42.72,9070
+Medicare/9,19.08,9070
+PM10-dust,24.44,216
+NYC/29,29.57,17037
+POI-lat,57.70,414
+POI-lon,63.36,414
+SD-bench,45.29,8
+Stocks-DE,42.89,42544
+Stocks-UK,31.33,57915
+Stocks-USA,34.98,275465
+Wind-dir,53.89,194237
diff --git a/publication/gorillas_compression_ratio.csv b/publication/gorillas_compression_ratio.csv
new file mode 100644
index 0000000..9188df3
--- /dev/null
+++ b/publication/gorillas_compression_ratio.csv
@@ -0,0 +1,31 @@
+dataset,size,vectors_count
+Air-Pressure,24.48,134493
+Arade/4,58.00,9657
+Basel-Temp,60.47,120
+Basel-Wind,62.20,120
+Bird-Mig,47.82,17
+Btc-Price,54.67,2
+Blockchain,62.89,225
+City-Temp,58.82,2837
+CMS/1,37.23,18140
+CMS/9,16.01,18140
+CMS/25,64.35,18140
+Dew-Temp,54.86,5287
+Bio-Temp,50.69,371892
+Food-prices,38.31,2002
+Gov/10,57.45,137816
+Gov/26,2.37,137816
+Gov/30,10.22,137816
+Gov/31,5.61,137816
+Gov/40,2.71,137816
+Medicare/1,45.49,9070
+Medicare/9,16.97,9070
+PM10-dust,27.50,216
+NYC/29,30.56,17037
+POI-lat,65.95,414
+POI-lon,66.10,414
+SD-bench,50.70,8
+Stocks-DE,46.18,42544
+Stocks-UK,34.75,57915
+Stocks-USA,37.20,275465
+Wind-dir,58.14,194237
diff --git a/publication/l.py b/publication/l.py
new file mode 100644
index 0000000..44552a6
--- /dev/null
+++ b/publication/l.py
@@ -0,0 +1,11 @@
+import pandas as pd
+
+
+def print_f():
+    df = pd.read_csv("alp_compression_ratio.csv")
+    for index, row in df.iterrows():
+        print('{{"{0}", "{1}"}},'.format(row['dataset'], row['size']))
+
+
+if __name__ == "__main__":
+    print_f()
diff --git a/publication/patas_compression_ratio.csv b/publication/patas_compression_ratio.csv
new file mode 100644
index 0000000..0f22e62
--- /dev/null
+++ b/publication/patas_compression_ratio.csv
@@ -0,0 +1,31 @@
+dataset,size,vectors_count
+Air-Pressure,27.87,134493
+Arade/4,59.10,9657
+Basel-Temp,36.41,120
+Basel-Wind,48.86,120
+Bird-Mig,36.08,17
+Btc-Price,56.80,2
+Blockchain,62.58,225
+City-Temp,24.16,2837
+CMS/1,36.78,18140
+CMS/9,25.97,18140
+CMS/25,70.09,18140
+Dew-Temp,39.01,5287
+Bio-Temp,22.88,371892
+Food-prices,28.29,2002
+Gov/10,35.84,137816
+Gov/26,16.21,137816
+Gov/30,19.28,137816
+Gov/31,17.07,137816
+Gov/40,16.34,137816
+Medicare/1,39.91,9070
+Medicare/9,26.27,9070
+PM10-dust,19.90,216
+NYC/29,38.78,17037
+POI-lat,71.72,414
+POI-lon,75.88,414
+SD-bench,22.80,8
+Stocks-DE,20.77,42544
+Stocks-UK,21.49,57915
+Stocks-USA,19.21,275465
+Wind-dir,28.14,194237
diff --git a/publication/results/c6g/README.md b/publication/results/c6g/README.md
new file mode 100644
index 0000000..ccc175e
--- /dev/null
+++ b/publication/results/c6g/README.md
@@ -0,0 +1,50 @@
+# Info
+[c6g](https://aws.amazon.com/ec2/instance-types/c6g/):
+
+---
+
+- `lscpu`
+  - ```shell
+
+    ubuntu@ip-172-31-75-129:~$ lscpu
+    Architecture: aarch64
+    CPU op-mode(s): 32-bit, 64-bit
+    Byte Order: Little Endian
+    CPU(s): 1
+    On-line CPU(s) list: 0
+    Vendor ID: ARM
+    Model name: Neoverse-N1
+    Model: 1
+    Thread(s) per core: 1
+    Core(s) per socket: 1
+    Socket(s): 1
+    Stepping: r3p1
+    BogoMIPS: 243.75
+    Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm lrcpc dcpop asimddp ssbs
+    Caches (sum of all):
+      L1d: 64 KiB (1 instance)
+      L1i: 64 KiB (1 instance)
+      L2: 1 MiB (1 instance)
+      L3: 32 MiB (1 instance)
+    NUMA:
+      NUMA node(s): 1
+      NUMA node0 CPU(s): 0
+    Vulnerabilities:
+      Itlb multihit: Not affected
+      L1tf: Not affected
+      Mds: Not affected
+      Meltdown: Not affected
+      Mmio stale data: Not affected
+      Retbleed: Not affected
+      Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl
+      Spectre v1: Mitigation; __user pointer sanitization
+      Spectre v2: Mitigation; CSV2, BHB
+      Srbds: Not affected
+      Tsx async abort: Not affected
+    ubuntu@ip-172-31-75-129:~$
+
+    ```
+---
+## History
+
+---
\ No newline at end of file
diff --git a/publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.csv b/publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.csv
new file mode 100644
index 0000000..250c7b5
--- /dev/null
+++ b/publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.csv
@@ -0,0 +1,63 @@
+benchmark_number,name,iterations,cycles_per_tuple
+0,air_sensor_f_fused,3000000,5.4299
+0,air_sensor_f,3000000,5.1922
+1,arade4_fused,3000000,3.45965
+1,arade4,3000000,4.63204
+2,basel_temp_f_fused,3000000,3.68688
+2,basel_temp_f,3000000,4.74089
+3,basel_wind_f_fused,3000000,3.6037
+3,basel_wind_f,3000000,4.71144
+4,bird_migration_f_fused,3000000,3.56467
+4,bird_migration_f,3000000,4.56014
+5,bitcoin_f_fused,3000000,3.64717
+5,bitcoin_f,3000000,4.74717
+6,bitcoin_transactions_f_fused,3000000,3.64015
+6,bitcoin_transactions_f,3000000,4.7588
+7,city_temperature_f_fused,3000000,3.5301
+7,city_temperature_f,3000000,4.52529
+8,cms1_fused,3000000,3.7873
+8,cms1,3000000,4.91385
+9,cms9_fused,3000000,3.53923
+9,cms9,3000000,4.51555
+10,cms25_fused,3000000,3.74367
+10,cms25,3000000,4.87515
+11,food_prices_fused,3000000,3.62235
+11,food_prices,3000000,4.69655
+12,gov10_fused,3000000,3.65237
+12,gov10,3000000,4.76616
+13,gov26_fused,3000000,0.501208
+13,gov26,3000000,3.88891
+14,gov30_fused,3000000,0.509926
+14,gov30,3000000,3.90446
+15,gov31_fused,3000000,0.50336
+15,gov31,3000000,3.95155
+16,gov40_fused,3000000,0.505728
+16,gov40,3000000,3.95028
+17,medicare1_fused,3000000,3.74354
+17,medicare1,3000000,4.85281
+18,medicare9_fused,3000000,3.52862
+18,medicare9,3000000,4.51003
+19,neon_air_pressure_fused,3000000,3.24981
+19,neon_air_pressure,3000000,4.30185
+20,neon_bio_temp_c_fused,3000000,3.51983
+20,neon_bio_temp_c,3000000,4.50879
+21,neon_dew_point_temp_fused,3000000,3.57892
+21,neon_dew_point_temp,3000000,4.58457
+22,neon_pm10_dust_fused,3000000,3.38266
+22,neon_pm10_dust,3000000,4.40388
+23,neon_wind_dir_fused,3000000,3.23784
+23,neon_wind_dir,3000000,4.32095
+24,nyc29_fused,3000000,3.74998
+24,nyc29,3000000,4.87682
+25,poi_lat_fused,3000000,5.31027
+25,poi_lat,3000000,5.43628
+26,poi_lon_fused,3000000,3.99858
+26,poi_lon,3000000,5.25359
+27,ssd_hdd_benchmarks_f_fused,3000000,3.54425
+27,ssd_hdd_benchmarks_f,3000000,4.5505
+28,stocks_de_fused,3000000,3.54372
+28,stocks_de,3000000,4.53256
+29,stocks_uk_fused,3000000,3.51056
+29,stocks_uk,3000000,4.51019
+30,stocks_usa_c_fused,3000000,3.52934
+30,stocks_usa_c,3000000,4.52343
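(All of the results CSVs in this patch share the schema `benchmark_number,name,iterations,cycles_per_tuple`, and every dataset appears twice: once as `<name>_fused` and once as plain `<name>`, which the file names suggest are the fused and non-fused FALP kernels. As a minimal pandas sketch, in the same spirit as `publication/l.py` above, one might pair the two rows and compute the per-dataset speedup; the file path below is the c6g CSV just added, and the pairing logic assumes only the naming convention visible in the data.)

```python
import pandas as pd

# One of the per-target result files added in this patch.
df = pd.read_csv("publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.csv")

# Rows come in pairs: "<dataset>_fused" and "<dataset>".
df["dataset"] = df["name"].str.replace(r"_fused$", "", regex=True)
df["variant"] = df["name"].str.endswith("_fused").map({True: "fused", False: "plain"})

# One row per dataset with both cycle counts, plus the fused-over-plain speedup.
wide = df.pivot(index="dataset", columns="variant", values="cycles_per_tuple")
wide["speedup"] = wide["plain"] / wide["fused"]
print(wide.sort_values("speedup", ascending=False))
```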
diff --git a/publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata b/publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata
new file mode 100644
index 0000000..7f6a44b
--- /dev/null
+++ b/publication/results/c6g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata
@@ -0,0 +1,27 @@
+2023-03-30T18:56:20+00:00
+Run on (1 X 243.75 MHz CPU )
+CPU Caches:
+  L1 Data 64 KiB (x1)
+  L1 Instruction 64 KiB (x1)
+  L2 Unified 1024 KiB (x1)
+  L3 Unified 32768 KiB (x1)
+Load Average: 0.20, 0.15, 0.14
+cmake info:
+ source_dir: /home/ubuntu/bench_alp
+ cmake_osx_architectures
+ cmake_host_system_processor: aarch64
+ cmake_system_processor: arm64
+ cmake_host_system_name: Linux
+ cmake_system_name: Linux
+ cmake_c_compiler: /usr/bin/clang
+ cmake_cxx_compiler: /usr/bin/clang++
+ cmake_cxx_compiler_id: Clang
+ cmake_cxx_compiler_version: 14.0.0
+ cmake_crosscompiling: TRUE
+ cmake_cxx_flags_debug: -g
+ cmake_cxx_flags_release: -O3 -DNDEBUG
+ cmake_build_type: Release
+ cmake_toolchain_file: c6g
+target info:
+ target_name: arm64v8_neon_intrinsic_1024_uf1_falp
+ target_compile_options: -O3
diff --git a/publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.csv b/publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.csv
new file mode 100644
index 0000000..263e420
--- /dev/null
+++ b/publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.csv
@@ -0,0 +1,63 @@
+benchmark_number,name,iterations,cycles_per_tuple
+0,air_sensor_f_fused,3000000,5.52566
+0,air_sensor_f,3000000,5.26645
+1,arade4_fused,3000000,3.4605
+1,arade4,3000000,4.63257
+2,basel_temp_f_fused,3000000,3.68883
+2,basel_temp_f,3000000,4.73434
+3,basel_wind_f_fused,3000000,3.60715
+3,basel_wind_f,3000000,4.76232
+4,bird_migration_f_fused,3000000,3.55154
+4,bird_migration_f,3000000,4.55979
+5,bitcoin_f_fused,3000000,3.64933
+5,bitcoin_f,3000000,4.74779
+6,bitcoin_transactions_f_fused,3000000,3.68762
+6,bitcoin_transactions_f,3000000,4.75721
+7,city_temperature_f_fused,3000000,3.53057
+7,city_temperature_f,3000000,4.52509
+8,cms1_fused,3000000,3.78906
+8,cms1,3000000,4.91872
+9,cms9_fused,3000000,3.53061
+9,cms9,3000000,4.51218
+10,cms25_fused,3000000,3.7403
+10,cms25,3000000,4.87706
+11,food_prices_fused,3000000,3.63079
+11,food_prices,3000000,4.69638
+12,gov10_fused,3000000,3.65348
+12,gov10,3000000,4.88116
+13,gov26_fused,3000000,0.501231
+13,gov26,3000000,3.88871
+14,gov30_fused,3000000,0.509899
+14,gov30,3000000,3.91109
+15,gov31_fused,3000000,0.503698
+15,gov31,3000000,3.95455
+16,gov40_fused,3000000,0.505071
+16,gov40,3000000,3.94847
+17,medicare1_fused,3000000,3.74021
+17,medicare1,3000000,4.93967
+18,medicare9_fused,3000000,3.50735
+18,medicare9,3000000,4.50948
+19,neon_air_pressure_fused,3000000,3.23415
+19,neon_air_pressure,3000000,4.30181
+20,neon_bio_temp_c_fused,3000000,3.51982
+20,neon_bio_temp_c,3000000,4.50525
+21,neon_dew_point_temp_fused,3000000,3.57973
+21,neon_dew_point_temp,3000000,4.58511
+22,neon_pm10_dust_fused,3000000,3.35528
+22,neon_pm10_dust,3000000,4.39049
+23,neon_wind_dir_fused,3000000,3.21794
+23,neon_wind_dir,3000000,4.31902
+24,nyc29_fused,3000000,3.73152
+24,nyc29,3000000,4.87266
+25,poi_lat_fused,3000000,5.46704
+25,poi_lat,3000000,5.39144
+26,poi_lon_fused,3000000,4.01029
+26,poi_lon,3000000,5.13161
+27,ssd_hdd_benchmarks_f_fused,3000000,3.54425
+27,ssd_hdd_benchmarks_f,3000000,4.54575
+28,stocks_de_fused,3000000,3.51926
+28,stocks_de,3000000,4.52906
+29,stocks_uk_fused,3000000,3.51055
+29,stocks_uk,3000000,4.50984
+30,stocks_usa_c_fused,3000000,3.52935
+30,stocks_usa_c,3000000,4.524
diff --git a/publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.metadata b/publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.metadata new file mode 100644 index 0000000..10ee073 --- /dev/null +++ b/publication/results/c6g/fallback_scalar_aav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T19:11:37+00:00 +Run on (1 X 243.75 MHz CPU ) +CPU Caches: + L1 Data 64 KiB (x1) + L1 Instruction 64 KiB (x1) + L2 Unified 1024 KiB (x1) + L3 Unified 32768 KiB (x1) +Load Average: 0.20, 0.53, 0.46 +cmake info: + source_dir: /home/ubuntu/bench_alp + cmake_osx_architectures + cmake_host_system_processor: aarch64 + cmake_system_processor: arm64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: c6g +target info: + target_name: fallback_scalar_aav_1024_uf1_falp + target_compile_options: -O3 diff --git a/publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.csv b/publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.csv new file mode 100644 index 0000000..a83d1ae --- /dev/null +++ b/publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,5.52568 +0,air_sensor_f,3000000,5.26674 +1,arade4_fused,3000000,3.46043 +1,arade4,3000000,4.63173 +2,basel_temp_f_fused,3000000,3.6888 +2,basel_temp_f,3000000,4.73481 +3,basel_wind_f_fused,3000000,3.60709 +3,basel_wind_f,3000000,4.76619 +4,bird_migration_f_fused,3000000,3.5516 +4,bird_migration_f,3000000,4.55978 +5,bitcoin_f_fused,3000000,3.64943 +5,bitcoin_f,3000000,4.7477 +6,bitcoin_transactions_f_fused,3000000,3.68778 +6,bitcoin_transactions_f,3000000,4.7569 +7,city_temperature_f_fused,3000000,3.53057 +7,city_temperature_f,3000000,4.52499 +8,cms1_fused,3000000,3.78867 +8,cms1,3000000,4.91776 +9,cms9_fused,3000000,3.53025 +9,cms9,3000000,4.51316 +10,cms25_fused,3000000,3.74043 +10,cms25,3000000,4.87941 +11,food_prices_fused,3000000,3.63077 +11,food_prices,3000000,4.69626 +12,gov10_fused,3000000,3.6539 +12,gov10,3000000,4.87907 +13,gov26_fused,3000000,0.501279 +13,gov26,3000000,3.8889 +14,gov30_fused,3000000,0.5099 +14,gov30,3000000,3.91047 +15,gov31_fused,3000000,0.503659 +15,gov31,3000000,3.95134 +16,gov40_fused,3000000,0.505107 +16,gov40,3000000,3.95097 +17,medicare1_fused,3000000,3.74031 +17,medicare1,3000000,4.93978 +18,medicare9_fused,3000000,3.50835 +18,medicare9,3000000,4.50956 +19,neon_air_pressure_fused,3000000,3.235 +19,neon_air_pressure,3000000,4.30168 +20,neon_bio_temp_c_fused,3000000,3.51982 +20,neon_bio_temp_c,3000000,4.50558 +21,neon_dew_point_temp_fused,3000000,3.57966 +21,neon_dew_point_temp,3000000,4.58698 +22,neon_pm10_dust_fused,3000000,3.35551 +22,neon_pm10_dust,3000000,4.39119 +23,neon_wind_dir_fused,3000000,3.21796 +23,neon_wind_dir,3000000,4.31369 +24,nyc29_fused,3000000,3.73131 +24,nyc29,3000000,4.87154 +25,poi_lat_fused,3000000,5.46704 +25,poi_lat,3000000,5.39154 +26,poi_lon_fused,3000000,4.011 +26,poi_lon,3000000,5.13152 +27,ssd_hdd_benchmarks_f_fused,3000000,3.54425 +27,ssd_hdd_benchmarks_f,3000000,4.54601 +28,stocks_de_fused,3000000,3.51934 +28,stocks_de,3000000,4.52888 +29,stocks_uk_fused,3000000,3.51056 +29,stocks_uk,3000000,4.5101 +30,stocks_usa_c_fused,3000000,3.52935 +30,stocks_usa_c,3000000,4.52406 diff --git 
a/publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.metadata b/publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.metadata new file mode 100644 index 0000000..760efbf --- /dev/null +++ b/publication/results/c6g/fallback_scalar_nav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T19:04:27+00:00 +Run on (1 X 243.75 MHz CPU ) +CPU Caches: + L1 Data 64 KiB (x1) + L1 Instruction 64 KiB (x1) + L2 Unified 1024 KiB (x1) + L3 Unified 32768 KiB (x1) +Load Average: 0.12, 0.39, 0.32 +cmake info: + source_dir: /home/ubuntu/bench_alp + cmake_osx_architectures + cmake_host_system_processor: aarch64 + cmake_system_processor: arm64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: c6g +target info: + target_name: fallback_scalar_nav_1024_uf1_falp + target_compile_options: -O3 diff --git a/publication/results/c7g/README.md b/publication/results/c7g/README.md new file mode 100644 index 0000000..35c8ac3 --- /dev/null +++ b/publication/results/c7g/README.md @@ -0,0 +1,47 @@ +# Info +[c7g](https://aws.amazon.com/ec2/instance-types/c7g/): + +--- + +- `lscpu` + - ```shell + ubuntu@ip-172-31-61-205:~$ lscpu + Architecture: aarch64 + CPU op-mode(s): 32-bit, 64-bit + Byte Order: Little Endian + CPU(s): 1 + On-line CPU(s) list: 0 + Vendor ID: ARM + Model: 1 + Thread(s) per core: 1 + Core(s) per socket: 1 + Socket(s): 1 + Stepping: r1p1 + BogoMIPS: 2100.00 + Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma lrcpc dcpop sha3 sm3 sm4 asimddp sha512 sve asimdfhm dit uscat ilrcpc flagm ssbs pac + a pacg dcpodp svei8mm svebf16 i8mm bf16 dgh rng + Caches (sum of all): + L1d: 64 KiB (1 instance) + L1i: 64 KiB (1 instance) + L2: 1 MiB (1 instance) + L3: 32 MiB (1 instance) + NUMA: + NUMA node(s): 1 + NUMA node0 CPU(s): 0 + Vulnerabilities: + Itlb multihit: Not affected + L1tf: Not affected + Mds: Not affected + Meltdown: Not affected + Mmio stale data: Not affected + Retbleed: Not affected + Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl + Spectre v1: Mitigation; __user pointer sanitization + Spectre v2: Mitigation; CSV2, BHB + Srbds: Not affected + Tsx async abort: Not affected + ``` +--- +## History + +-- \ No newline at end of file diff --git a/publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.csv b/publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..0ad9171 --- /dev/null +++ b/publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,2.72792 +0,air_sensor_f,3000000,3.44459 +1,arade4_fused,3000000,1.88563 +1,arade4,3000000,2.90725 +2,basel_temp_f_fused,3000000,1.99064 +2,basel_temp_f,3000000,3.10304 +3,basel_wind_f_fused,3000000,1.95745 +3,basel_wind_f,3000000,3.15712 +4,bird_migration_f_fused,3000000,1.89942 +4,bird_migration_f,3000000,2.92685 +5,bitcoin_f_fused,3000000,1.98389 +5,bitcoin_f,3000000,3.1084 +6,bitcoin_transactions_f_fused,3000000,1.97489 +6,bitcoin_transactions_f,3000000,3.1421 +7,city_temperature_f_fused,3000000,1.88387 +7,city_temperature_f,3000000,2.94823 +8,cms1_fused,3000000,2.06552 +8,cms1,3000000,3.00479 +9,cms9_fused,3000000,1.85955
+9,cms9,3000000,2.90213 +10,cms25_fused,3000000,2.0502 +10,cms25,3000000,3.21384 +11,food_prices_fused,3000000,1.97329 +11,food_prices,3000000,3.02804 +12,gov10_fused,3000000,1.99557 +12,gov10,3000000,3.22045 +13,gov26_fused,3000000,0.505515 +13,gov26,3000000,2.73372 +14,gov30_fused,3000000,0.505863 +14,gov30,3000000,2.73381 +15,gov31_fused,3000000,0.505516 +15,gov31,3000000,2.73433 +16,gov40_fused,3000000,0.605441 +16,gov40,3000000,2.73329 +17,medicare1_fused,3000000,2.05954 +17,medicare1,3000000,3.22594 +18,medicare9_fused,3000000,1.86403 +18,medicare9,3000000,2.89009 +19,neon_air_pressure_fused,3000000,1.70679 +19,neon_air_pressure,3000000,2.95267 +20,neon_bio_temp_c_fused,3000000,1.86305 +20,neon_bio_temp_c,3000000,2.89873 +21,neon_dew_point_temp_fused,3000000,1.89596 +21,neon_dew_point_temp,3000000,2.96777 +22,neon_pm10_dust_fused,3000000,1.77815 +22,neon_pm10_dust,3000000,2.86005 +23,neon_wind_dir_fused,3000000,1.7032 +23,neon_wind_dir,3000000,2.95235 +24,nyc29_fused,3000000,2.05013 +24,nyc29,3000000,3.2178 +25,poi_lat_fused,3000000,2.70987 +25,poi_lat,3000000,3.57097 +26,poi_lon_fused,3000000,2.25422 +26,poi_lon,3000000,3.41019 +27,ssd_hdd_benchmarks_f_fused,3000000,1.90319 +27,ssd_hdd_benchmarks_f,3000000,2.93572 +28,stocks_de_fused,3000000,1.86481 +28,stocks_de,3000000,2.91186 +29,stocks_uk_fused,3000000,1.82858 +29,stocks_uk,3000000,2.84442 +30,stocks_usa_c_fused,3000000,1.84542 +30,stocks_usa_c,3000000,2.71079 diff --git a/publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata b/publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..fd4271e --- /dev/null +++ b/publication/results/c7g/arm64v8_neon_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T19:37:15+00:00 +Run on (1 X 2100 MHz CPU ) +CPU Caches: + L1 Data 64 KiB (x1) + L1 Instruction 64 KiB (x1) + L2 Unified 1024 KiB (x1) + L3 Unified 32768 KiB (x1) +Load Average: 0.21, 0.34, 0.18 +cmake info: + source_dir: /tmp/tmp.bgNil5yhhb + cmake_osx_architectures + cmake_host_system_processor: aarch64 + cmake_system_processor: arm64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: c7g +target info: + target_name: arm64v8_neon_intrinsic_1024_uf1_falp + target_compile_options: -O3 diff --git a/publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.csv b/publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.csv new file mode 100644 index 0000000..f2119a7 --- /dev/null +++ b/publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,2.66433 +0,air_sensor_f,3000000,3.44197 +1,arade4_fused,3000000,1.88429 +1,arade4,3000000,2.92618 +2,basel_temp_f_fused,3000000,1.98998 +2,basel_temp_f,3000000,3.11813 +3,basel_wind_f_fused,3000000,1.95222 +3,basel_wind_f,3000000,3.16546 +4,bird_migration_f_fused,3000000,1.90272 +4,bird_migration_f,3000000,2.96381 +5,bitcoin_f_fused,3000000,1.9835 +5,bitcoin_f,3000000,2.98909 +6,bitcoin_transactions_f_fused,3000000,1.97428 +6,bitcoin_transactions_f,3000000,2.91731 +7,city_temperature_f_fused,3000000,1.88198 +7,city_temperature_f,3000000,2.96222 +8,cms1_fused,3000000,2.06501 +8,cms1,3000000,3.25438 +9,cms9_fused,3000000,1.86085 
+9,cms9,3000000,2.92978 +10,cms25_fused,3000000,2.05023 +10,cms25,3000000,3.19882 +11,food_prices_fused,3000000,1.96032 +11,food_prices,3000000,3.13331 +12,gov10_fused,3000000,1.99473 +12,gov10,3000000,3.22778 +13,gov26_fused,3000000,0.50002 +13,gov26,3000000,2.73008 +14,gov30_fused,3000000,0.503929 +14,gov30,3000000,2.73664 +15,gov31_fused,3000000,0.500996 +15,gov31,3000000,2.73223 +16,gov40_fused,3000000,0.603468 +16,gov40,3000000,2.73242 +17,medicare1_fused,3000000,2.06069 +17,medicare1,3000000,3.22677 +18,medicare9_fused,3000000,1.86313 +18,medicare9,3000000,2.94463 +19,neon_air_pressure_fused,3000000,1.70609 +19,neon_air_pressure,3000000,2.83451 +20,neon_bio_temp_c_fused,3000000,1.86531 +20,neon_bio_temp_c,3000000,2.92486 +21,neon_dew_point_temp_fused,3000000,1.89772 +21,neon_dew_point_temp,3000000,2.95051 +22,neon_pm10_dust_fused,3000000,1.77835 +22,neon_pm10_dust,3000000,2.8355 +23,neon_wind_dir_fused,3000000,1.70136 +23,neon_wind_dir,3000000,2.83443 +24,nyc29_fused,3000000,2.04937 +24,nyc29,3000000,3.19534 +25,poi_lat_fused,3000000,2.74405 +25,poi_lat,3000000,3.57005 +26,poi_lon_fused,3000000,2.25463 +26,poi_lon,3000000,3.45143 +27,ssd_hdd_benchmarks_f_fused,3000000,1.90057 +27,ssd_hdd_benchmarks_f,3000000,2.93408 +28,stocks_de_fused,3000000,1.8648 +28,stocks_de,3000000,2.93106 +29,stocks_uk_fused,3000000,1.82687 +29,stocks_uk,3000000,2.85658 +30,stocks_usa_c_fused,3000000,1.84392 +30,stocks_usa_c,3000000,2.90866 diff --git a/publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.metadata b/publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.metadata new file mode 100644 index 0000000..f21755a --- /dev/null +++ b/publication/results/c7g/fallback_scalar_aav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T19:46:50+00:00 +Run on (1 X 2100 MHz CPU ) +CPU Caches: + L1 Data 64 KiB (x1) + L1 Instruction 64 KiB (x1) + L2 Unified 1024 KiB (x1) + L3 Unified 32768 KiB (x1) +Load Average: 0.14, 0.48, 0.37 +cmake info: + source_dir: /tmp/tmp.bgNil5yhhb + cmake_osx_architectures + cmake_host_system_processor: aarch64 + cmake_system_processor: arm64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: c7g +target info: + target_name: fallback_scalar_aav_1024_uf1_falp + target_compile_options: -O3 diff --git a/publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.csv b/publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.csv new file mode 100644 index 0000000..e5fea6e --- /dev/null +++ b/publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,2.66512 +0,air_sensor_f,3000000,3.44832 +1,arade4_fused,3000000,1.88421 +1,arade4,3000000,2.92449 +2,basel_temp_f_fused,3000000,1.99014 +2,basel_temp_f,3000000,3.12212 +3,basel_wind_f_fused,3000000,1.95842 +3,basel_wind_f,3000000,3.16644 +4,bird_migration_f_fused,3000000,1.90233 +4,bird_migration_f,3000000,2.94014 +5,bitcoin_f_fused,3000000,1.984 +5,bitcoin_f,3000000,2.99084 +6,bitcoin_transactions_f_fused,3000000,1.9744 +6,bitcoin_transactions_f,3000000,2.9195 +7,city_temperature_f_fused,3000000,1.88194 +7,city_temperature_f,3000000,2.95845 +8,cms1_fused,3000000,2.06703 +8,cms1,3000000,3.26291 +9,cms9_fused,3000000,1.85947 
+9,cms9,3000000,2.9275 +10,cms25_fused,3000000,2.05029 +10,cms25,3000000,3.19439 +11,food_prices_fused,3000000,1.96382 +11,food_prices,3000000,3.14076 +12,gov10_fused,3000000,1.99476 +12,gov10,3000000,3.23388 +13,gov26_fused,3000000,0.500021 +13,gov26,3000000,2.73015 +14,gov30_fused,3000000,0.503933 +14,gov30,3000000,2.73613 +15,gov31_fused,3000000,0.500997 +15,gov31,3000000,2.73255 +16,gov40_fused,3000000,0.603501 +16,gov40,3000000,2.73238 +17,medicare1_fused,3000000,2.06062 +17,medicare1,3000000,3.22619 +18,medicare9_fused,3000000,1.86285 +18,medicare9,3000000,2.93905 +19,neon_air_pressure_fused,3000000,1.70606 +19,neon_air_pressure,3000000,2.83726 +20,neon_bio_temp_c_fused,3000000,1.86583 +20,neon_bio_temp_c,3000000,2.92049 +21,neon_dew_point_temp_fused,3000000,1.8975 +21,neon_dew_point_temp,3000000,2.9535 +22,neon_pm10_dust_fused,3000000,1.77831 +22,neon_pm10_dust,3000000,2.84092 +23,neon_wind_dir_fused,3000000,1.70133 +23,neon_wind_dir,3000000,2.82423 +24,nyc29_fused,3000000,2.0494 +24,nyc29,3000000,3.1862 +25,poi_lat_fused,3000000,2.74181 +25,poi_lat,3000000,3.57259 +26,poi_lon_fused,3000000,2.25432 +26,poi_lon,3000000,3.45185 +27,ssd_hdd_benchmarks_f_fused,3000000,1.90272 +27,ssd_hdd_benchmarks_f,3000000,2.95313 +28,stocks_de_fused,3000000,1.86574 +28,stocks_de,3000000,2.92497 +29,stocks_uk_fused,3000000,1.82689 +29,stocks_uk,3000000,2.86139 +30,stocks_usa_c_fused,3000000,1.84367 +30,stocks_usa_c,3000000,2.91015 diff --git a/publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.metadata b/publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.metadata new file mode 100644 index 0000000..2ffd658 --- /dev/null +++ b/publication/results/c7g/fallback_scalar_nav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T19:41:57+00:00 +Run on (1 X 2100 MHz CPU ) +CPU Caches: + L1 Data 64 KiB (x1) + L1 Instruction 64 KiB (x1) + L2 Unified 1024 KiB (x1) + L3 Unified 32768 KiB (x1) +Load Average: 0.25, 0.46, 0.29 +cmake info: + source_dir: /tmp/tmp.bgNil5yhhb + cmake_osx_architectures + cmake_host_system_processor: aarch64 + cmake_system_processor: arm64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: c7g +target info: + target_name: fallback_scalar_nav_1024_uf1_falp + target_compile_options: -O3 diff --git a/publication/results/i4i/README.md b/publication/results/i4i/README.md new file mode 100644 index 0000000..6be420f --- /dev/null +++ b/publication/results/i4i/README.md @@ -0,0 +1,8 @@ +# Info +[i4i](https://aws.amazon.com/ec2/instance-types/i4i/): +- I4i instances are powered by the latest generation Intel Xeon Scalable (Ice Lake) Processors with an all-core turbo frequency of 3.5 GHz. + + +--- +## History +1. 
[link](https://www.cpubenchmark.net/cpu.php?cpu=Intel+Xeon+Platinum+8375C+%40+2.90GHz&id=4486) \ No newline at end of file diff --git a/publication/results/i4i/alp_decode_cutter.csv b/publication/results/i4i/alp_decode_cutter.csv new file mode 100644 index 0000000..89258b5 --- /dev/null +++ b/publication/results/i4i/alp_decode_cutter.csv @@ -0,0 +1,4 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode,300000,1.66781 +25,poi_lat_encode,300000,1.74289 +26,poi_lon_encode,300000,1.69417 diff --git a/publication/results/i4i/alp_decode_cutter.metadata b/publication/results/i4i/alp_decode_cutter.metadata new file mode 100644 index 0000000..ddbb63f --- /dev/null +++ b/publication/results/i4i/alp_decode_cutter.metadata @@ -0,0 +1,27 @@ +2023-04-11T23:54:01+00:00 +Run on (2 X 3499.4 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.04, 0.06, 0.15 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/alp_encode.csv b/publication/results/i4i/alp_encode.csv new file mode 100644 index 0000000..6318089 --- /dev/null +++ b/publication/results/i4i/alp_encode.csv @@ -0,0 +1,29 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_encode,300000,2.00765 +2,basel_temp_f_encode,300000,2.13862 +3,basel_wind_f_encode,300000,2.07267 +4,bird_migration_f_encode,300000,2.00562 +5,bitcoin_f_encode,300000,2.04599 +6,bitcoin_transactions_f_encode,300000,2.04432 +7,city_temperature_f_encode,300000,1.98136 +8,cms1_encode,300000,2.11394 +9,cms9_encode,300000,1.97176 +10,cms25_encode,300000,2.05183 +11,food_prices_encode,300000,2.111 +12,gov10_encode,300000,2.34254 +13,gov26_encode,300000,1.6187 +14,gov30_encode,300000,1.62877 +15,gov31_encode,300000,1.61745 +16,gov40_encode,300000,1.61506 +17,medicare1_encode,300000,2.15331 +18,medicare9_encode,300000,1.97265 +19,neon_air_pressure_encode,300000,1.93896 +20,neon_bio_temp_c_encode,300000,1.96332 +21,neon_dew_point_temp_encode,300000,2.0032 +22,neon_pm10_dust_encode,300000,1.88541 +23,neon_wind_dir_encode,300000,1.92846 +24,nyc29_encode,300000,2.05096 +27,ssd_hdd_benchmarks_f_encode,300000,2.00583 +28,stocks_de_encode,300000,1.97255 +29,stocks_uk_encode,300000,1.98069 +30,stocks_usa_c_encode,300000,1.97838 diff --git a/publication/results/i4i/alp_encode.metadata b/publication/results/i4i/alp_encode.metadata new file mode 100644 index 0000000..1a5c2a2 --- /dev/null +++ b/publication/results/i4i/alp_encode.metadata @@ -0,0 +1,27 @@ +2023-04-11T20:54:00+00:00 +Run on (2 X 3499.15 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.11, 0.07, 0.07 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + 
cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/alp_encode_cutter.csv b/publication/results/i4i/alp_encode_cutter.csv new file mode 100644 index 0000000..a85a104 --- /dev/null +++ b/publication/results/i4i/alp_encode_cutter.csv @@ -0,0 +1,4 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode,300000,8.33457 +25,poi_lat_encode,300000,7.04858 +26,poi_lon_encode,300000,6.34115 diff --git a/publication/results/i4i/alp_encode_cutter.metadata b/publication/results/i4i/alp_encode_cutter.metadata new file mode 100644 index 0000000..66dcdd5 --- /dev/null +++ b/publication/results/i4i/alp_encode_cutter.metadata @@ -0,0 +1,27 @@ +2023-04-11T21:46:01+00:00 +Run on (2 X 2899.97 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.08, 0.03, 0.00 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/alp_encode_pde.csv b/publication/results/i4i/alp_encode_pde.csv new file mode 100644 index 0000000..506cd03 --- /dev/null +++ b/publication/results/i4i/alp_encode_pde.csv @@ -0,0 +1,29 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_encode,300000,2.42748 +2,basel_temp_f_encode,300000,2.56604 +3,basel_wind_f_encode,300000,2.51426 +4,bird_migration_f_encode,300000,2.43341 +5,bitcoin_f_encode,300000,2.50302 +6,bitcoin_transactions_f_encode,300000,2.44147 +7,city_temperature_f_encode,300000,2.35965 +8,cms1_encode,300000,2.55548 +9,cms9_encode,300000,2.36876 +10,cms25_encode,300000,2.46491 +11,food_prices_encode,300000,2.54094 +12,gov10_encode,300000,2.75918 +13,gov26_encode,300000,1.93522 +14,gov30_encode,300000,1.98955 +15,gov31_encode,300000,1.97655 +16,gov40_encode,300000,1.98246 +17,medicare1_encode,300000,2.61874 +18,medicare9_encode,300000,2.37824 +19,neon_air_pressure_encode,300000,2.3322 +20,neon_bio_temp_c_encode,300000,2.32026 +21,neon_dew_point_temp_encode,300000,2.43697 +22,neon_pm10_dust_encode,300000,2.23132 +23,neon_wind_dir_encode,300000,2.27445 +24,nyc29_encode,300000,2.46386 +27,ssd_hdd_benchmarks_f_encode,300000,2.39474 +28,stocks_de_encode,300000,2.37429 +29,stocks_uk_encode,300000,2.36125 +30,stocks_usa_c_encode,300000,2.36252 diff --git a/publication/results/i4i/alp_encode_pde.metadata b/publication/results/i4i/alp_encode_pde.metadata new file mode 100644 index 0000000..066a141 --- /dev/null +++ b/publication/results/i4i/alp_encode_pde.metadata @@ -0,0 +1,27 @@ +2023-04-11T23:17:33+00:00 +Run on (2 X 3499.22 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.22, 0.18, 0.10 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: 
x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/alp_encode_without_sampling.csv b/publication/results/i4i/alp_encode_without_sampling.csv new file mode 100644 index 0000000..4c316b8 --- /dev/null +++ b/publication/results/i4i/alp_encode_without_sampling.csv @@ -0,0 +1,29 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_encode_simdized,300000,2.30736 +2,basel_temp_f_encode_simdized,300000,2.74485 +3,basel_wind_f_encode_simdized,300000,2.3573 +4,bird_migration_f_encode_simdized,300000,2.22079 +5,bitcoin_f_encode_simdized,300000,2.34594 +6,bitcoin_transactions_f_encode_simdized,300000,2.35674 +7,city_temperature_f_encode_simdized,300000,2.1915 +8,cms1_encode_simdized,300000,2.42561 +9,cms9_encode_simdized,300000,2.18019 +10,cms25_encode_simdized,300000,2.33086 +11,food_prices_encode_simdized,300000,2.50103 +12,gov10_encode_simdized,300000,2.74566 +13,gov26_encode_simdized,300000,1.79042 +14,gov30_encode_simdized,300000,1.83271 +15,gov31_encode_simdized,300000,1.79123 +16,gov40_encode_simdized,300000,1.80605 +17,medicare1_encode_simdized,300000,2.67781 +18,medicare9_encode_simdized,300000,2.19095 +19,neon_air_pressure_encode_simdized,300000,2.17364 +20,neon_bio_temp_c_encode_simdized,300000,2.16811 +21,neon_dew_point_temp_encode_simdized,300000,2.2736 +22,neon_pm10_dust_encode_simdized,300000,2.09247 +23,neon_wind_dir_encode_simdized,300000,2.12834 +24,nyc29_encode_simdized,300000,2.31336 +27,ssd_hdd_benchmarks_f_encode_simdized,300000,2.20627 +28,stocks_de_encode_simdized,300000,2.21913 +29,stocks_uk_encode_simdized,300000,2.18493 +30,stocks_usa_c_encode_simdized,300000,2.1881 diff --git a/publication/results/i4i/alp_encode_without_sampling.metadata b/publication/results/i4i/alp_encode_without_sampling.metadata new file mode 100644 index 0000000..744d7e7 --- /dev/null +++ b/publication/results/i4i/alp_encode_without_sampling.metadata @@ -0,0 +1,27 @@ +2023-04-11T23:16:47+00:00 +Run on (2 X 3502.43 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.20, 0.17, 0.09 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/chimp.csv b/publication/results/i4i/chimp.csv new file mode 100644 index 0000000..15c86e3 --- /dev/null +++ b/publication/results/i4i/chimp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode,300000,37.1182 +0,air_sensor_f_decode,300000,40.8874 +1,arade4_encode,300000,40.4516 +1,arade4_decode,300000,42.0097 +2,basel_temp_f_encode,300000,40.0175 
+2,basel_temp_f_decode,300000,42.2968 +3,basel_wind_f_encode,300000,39.3419 +3,basel_wind_f_decode,300000,41.057 +4,bird_migration_f_encode,300000,30.6485 +4,bird_migration_f_decode,300000,40.2715 +5,bitcoin_f_encode,300000,33.6886 +5,bitcoin_f_decode,300000,40.8915 +6,bitcoin_transactions_f_encode,300000,40.9255 +6,bitcoin_transactions_f_decode,300000,36.2518 +7,city_temperature_f_encode,300000,36.5124 +7,city_temperature_f_decode,300000,41.4059 +8,cms1_encode,300000,39.4016 +8,cms1_decode,300000,43.4837 +9,cms9_encode,300000,28.9921 +9,cms9_decode,300000,23.2551 +10,cms25_encode,300000,42.2945 +10,cms25_decode,300000,44.4238 +11,food_prices_encode,300000,21.2625 +11,food_prices_decode,300000,24.467 +12,gov10_encode,300000,36.277 +12,gov10_decode,300000,37.4671 +13,gov26_encode,300000,8.60201 +13,gov26_decode,300000,8.54531 +14,gov30_encode,300000,9.009 +14,gov30_decode,300000,9.14438 +15,gov31_encode,300000,8.69666 +15,gov31_decode,300000,8.8369 +16,gov40_encode,300000,8.74503 +16,gov40_decode,300000,8.8643 +17,medicare1_encode,300000,32.626 +17,medicare1_decode,300000,38.9576 +18,medicare9_encode,300000,30.1559 +18,medicare9_decode,300000,24.116 +19,neon_air_pressure_encode,300000,30.7237 +19,neon_air_pressure_decode,300000,33.5255 +20,neon_bio_temp_c_encode,300000,34.7282 +20,neon_bio_temp_c_decode,300000,41.2496 +21,neon_dew_point_temp_encode,300000,34.3712 +21,neon_dew_point_temp_decode,300000,39.5259 +22,neon_pm10_dust_encode,300000,28.0917 +22,neon_pm10_dust_decode,300000,35.623 +23,neon_wind_dir_encode,300000,40.2993 +23,neon_wind_dir_decode,300000,42.7487 +24,nyc29_encode,300000,19.2743 +24,nyc29_decode,300000,28.9526 +25,poi_lat_encode,300000,41.6023 +25,poi_lat_decode,300000,41.4973 +26,poi_lon_encode,300000,46.3144 +26,poi_lon_decode,300000,46.136 +27,ssd_hdd_benchmarks_f_encode,300000,35.5111 +27,ssd_hdd_benchmarks_f_decode,300000,38.1932 +28,stocks_de_encode,300000,33.7291 +28,stocks_de_decode,300000,41.9743 +29,stocks_uk_encode,300000,27.0914 +29,stocks_uk_decode,300000,22.6512 +30,stocks_usa_c_encode,300000,23.4359 +30,stocks_usa_c_decode,300000,33.1116 diff --git a/publication/results/i4i/chimp.metadata b/publication/results/i4i/chimp.metadata new file mode 100644 index 0000000..5c785c8 --- /dev/null +++ b/publication/results/i4i/chimp.metadata @@ -0,0 +1,27 @@ +2023-04-01T21:10:58+00:00 +Run on (2 X 3499.71 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.00, 0.13, 0.15 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/chimp128.csv b/publication/results/i4i/chimp128.csv new file mode 100644 index 0000000..f8f3d09 --- /dev/null +++ b/publication/results/i4i/chimp128.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode,300000,43.3806 +0,air_sensor_f_decode,300000,42.9797 +1,arade4_encode,300000,44.1109 +1,arade4_decode,300000,44.0856 +2,basel_temp_f_encode,300000,36.9809 
+2,basel_temp_f_decode,300000,37.2357 +3,basel_wind_f_encode,300000,39.8262 +3,basel_wind_f_decode,300000,39.6214 +4,bird_migration_f_encode,300000,24.2276 +4,bird_migration_f_decode,300000,24.4812 +5,bitcoin_f_encode,300000,42.591 +5,bitcoin_f_decode,300000,42.0842 +6,bitcoin_transactions_f_encode,300000,39.1148 +6,bitcoin_transactions_f_decode,300000,39.0715 +7,city_temperature_f_encode,300000,23.4144 +7,city_temperature_f_decode,300000,23.4175 +8,cms1_encode,300000,43.5341 +8,cms1_decode,300000,44.0326 +9,cms9_encode,300000,23.5568 +9,cms9_decode,300000,23.5853 +10,cms25_encode,300000,45.4172 +10,cms25_decode,300000,45.8729 +11,food_prices_encode,300000,22.6938 +11,food_prices_decode,300000,22.5042 +12,gov10_encode,300000,29.044 +12,gov10_decode,300000,28.9446 +13,gov26_encode,300000,13.7301 +13,gov26_decode,300000,13.7503 +14,gov30_encode,300000,14.0004 +14,gov30_decode,300000,14.0062 +15,gov31_encode,300000,13.8002 +15,gov31_decode,300000,13.8025 +16,gov40_encode,300000,13.8651 +16,gov40_decode,300000,13.8599 +17,medicare1_encode,300000,32.0368 +17,medicare1_decode,300000,31.7506 +18,medicare9_encode,300000,23.74 +18,medicare9_decode,300000,23.701 +19,neon_air_pressure_encode,300000,34.8336 +19,neon_air_pressure_decode,300000,35.1131 +20,neon_bio_temp_c_encode,300000,24.874 +20,neon_bio_temp_c_decode,300000,25.2467 +21,neon_dew_point_temp_encode,300000,37.7593 +21,neon_dew_point_temp_decode,300000,37.7552 +22,neon_pm10_dust_encode,300000,17.95 +22,neon_pm10_dust_decode,300000,18.0811 +23,neon_wind_dir_encode,300000,24.963 +23,neon_wind_dir_decode,300000,25.0485 +24,nyc29_encode,300000,23.2611 +24,nyc29_decode,300000,23.2675 +25,poi_lat_encode,300000,43.8604 +25,poi_lat_decode,300000,43.646 +26,poi_lon_encode,300000,47.5153 +26,poi_lon_decode,300000,47.2768 +27,ssd_hdd_benchmarks_f_encode,300000,23.262 +27,ssd_hdd_benchmarks_f_decode,300000,23.2874 +28,stocks_de_encode,300000,19.6019 +28,stocks_de_decode,300000,19.5828 +29,stocks_uk_encode,300000,22.4747 +29,stocks_uk_decode,300000,22.4568 +30,stocks_usa_c_encode,300000,16.4777 +30,stocks_usa_c_decode,300000,16.4284 diff --git a/publication/results/i4i/chimp128.metadata b/publication/results/i4i/chimp128.metadata new file mode 100644 index 0000000..e1ce581 --- /dev/null +++ b/publication/results/i4i/chimp128.metadata @@ -0,0 +1,27 @@ +2023-04-11T23:39:53+00:00 +Run on (2 X 3498.27 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.30, 0.21, 0.21 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.csv b/publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.csv new file mode 100644 index 0000000..397f11c --- /dev/null +++ b/publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,300000,0.698698 +0,air_sensor_f,300000,1.22077 +1,arade4_fused,300000,0.502839 +1,arade4,300000,0.937631 
+2,basel_temp_f_fused,300000,0.610199 +2,basel_temp_f,300000,1.10845 +3,basel_wind_f_fused,300000,0.582147 +3,basel_wind_f,300000,1.04128 +4,bird_migration_f_fused,300000,0.528974 +4,bird_migration_f,300000,1.00338 +5,bitcoin_f_fused,300000,0.665596 +5,bitcoin_f,300000,1.16308 +6,bitcoin_transactions_f_fused,300000,0.676246 +6,bitcoin_transactions_f,300000,1.14415 +7,city_temperature_f_fused,300000,0.485712 +7,city_temperature_f,300000,0.943532 +8,cms1_fused,300000,0.584059 +8,cms1,300000,1.10966 +9,cms9_fused,300000,0.48499 +9,cms9,300000,0.943975 +10,cms25_fused,300000,0.579438 +10,cms25,300000,1.10932 +11,food_prices_fused,300000,0.666548 +11,food_prices,300000,1.1186 +12,gov10_fused,300000,0.665026 +12,gov10,300000,1.14578 +13,gov26_fused,300000,0.253739 +13,gov26,300000,0.840588 +14,gov30_fused,300000,0.547509 +14,gov30,300000,1.04648 +15,gov31_fused,300000,0.25451 +15,gov31,300000,0.840334 +16,gov40_fused,300000,0.505667 +16,gov40,300000,0.998222 +17,medicare1_fused,300000,0.757632 +17,medicare1,300000,1.25311 +18,medicare9_fused,300000,0.485518 +18,medicare9,300000,0.944299 +19,neon_air_pressure_fused,300000,0.612937 +19,neon_air_pressure,300000,1.04112 +20,neon_bio_temp_c_fused,300000,0.48046 +20,neon_bio_temp_c,300000,0.927999 +21,neon_dew_point_temp_fused,300000,0.494133 +21,neon_dew_point_temp,300000,0.961675 +22,neon_pm10_dust_fused,300000,0.451521 +22,neon_pm10_dust,300000,0.870878 +23,neon_wind_dir_fused,300000,0.439133 +23,neon_wind_dir,300000,0.883795 +24,nyc29_fused,300000,0.551528 +24,nyc29,300000,1.10932 +25,poi_lat_fused,300000,0.691147 +25,poi_lat,300000,1.22097 +26,poi_lon_fused,300000,0.583531 +26,poi_lon,300000,1.14444 +27,ssd_hdd_benchmarks_f_fused,300000,0.49768 +27,ssd_hdd_benchmarks_f,300000,0.986739 +28,stocks_de_fused,300000,0.481317 +28,stocks_de,300000,0.934456 +29,stocks_uk_fused,300000,0.483057 +29,stocks_uk,300000,0.940566 +30,stocks_usa_c_fused,300000,0.477826 +30,stocks_usa_c,300000,0.928562 diff --git a/publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.metadata b/publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.metadata new file mode 100644 index 0000000..9ddc555 --- /dev/null +++ b/publication/results/i4i/fallback_scalar_aav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-02-26T17:13:27+00:00 +Run on (2 X 3504.35 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.09, 0.19, 0.11 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: fallback_scalar_aav_1024_uf1_falp + target_compile_options: -mavx512dq diff --git a/publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.csv b/publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.csv new file mode 100644 index 0000000..22a93c2 --- /dev/null +++ b/publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,2.22162 +0,air_sensor_f,3000000,2.8147 +1,arade4_fused,3000000,1.74807 
+1,arade4,3000000,2.29135 +2,basel_temp_f_fused,3000000,1.92374 +2,basel_temp_f,3000000,2.45576 +3,basel_wind_f_fused,3000000,1.95872 +3,basel_wind_f,3000000,2.50499 +4,bird_migration_f_fused,3000000,1.86532 +4,bird_migration_f,3000000,2.39625 +5,bitcoin_f_fused,3000000,1.92638 +5,bitcoin_f,3000000,2.44426 +6,bitcoin_transactions_f_fused,3000000,1.95179 +6,bitcoin_transactions_f,3000000,2.48338 +7,city_temperature_f_fused,3000000,1.79951 +7,city_temperature_f,3000000,2.32968 +8,cms1_fused,3000000,2.09653 +8,cms1,3000000,2.67297 +9,cms9_fused,3000000,1.79013 +9,cms9,3000000,2.30271 +10,cms25_fused,3000000,2.0626 +10,cms25,3000000,2.71822 +11,food_prices_fused,3000000,1.93884 +11,food_prices,3000000,2.453 +12,gov10_fused,3000000,1.98815 +12,gov10,3000000,2.5329 +13,gov26_fused,3000000,0.832098 +13,gov26,3000000,2.22171 +14,gov30_fused,3000000,0.835231 +14,gov30,3000000,2.22656 +15,gov31_fused,3000000,0.833096 +15,gov31,3000000,2.22575 +16,gov40_fused,3000000,0.833688 +16,gov40,3000000,2.22636 +17,medicare1_fused,3000000,2.05258 +17,medicare1,3000000,2.70144 +18,medicare9_fused,3000000,1.79039 +18,medicare9,3000000,2.30375 +19,neon_air_pressure_fused,3000000,1.55111 +19,neon_air_pressure,3000000,2.23113 +20,neon_bio_temp_c_fused,3000000,1.78851 +20,neon_bio_temp_c,3000000,2.30258 +21,neon_dew_point_temp_fused,3000000,1.83616 +21,neon_dew_point_temp,3000000,2.35527 +22,neon_pm10_dust_fused,3000000,1.57373 +22,neon_pm10_dust,3000000,2.22504 +23,neon_wind_dir_fused,3000000,1.54877 +23,neon_wind_dir,3000000,2.2242 +24,nyc29_fused,3000000,2.06174 +24,nyc29,3000000,2.72308 +25,poi_lat_fused,3000000,2.31411 +25,poi_lat,3000000,2.88151 +26,poi_lon_fused,3000000,2.23298 +26,poi_lon,3000000,2.81231 +27,ssd_hdd_benchmarks_f_fused,3000000,1.86453 +27,ssd_hdd_benchmarks_f,3000000,2.39036 +28,stocks_de_fused,3000000,1.78986 +28,stocks_de,3000000,2.30407 +29,stocks_uk_fused,3000000,1.79728 +29,stocks_uk,3000000,2.32065 +30,stocks_usa_c_fused,3000000,1.79238 +30,stocks_usa_c,3000000,2.29452 diff --git a/publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.metadata b/publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.metadata new file mode 100644 index 0000000..e43e1be --- /dev/null +++ b/publication/results/i4i/fallback_scalar_nav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-04-01T19:04:38+00:00 +Run on (2 X 3506.46 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.67, 0.57, 0.28 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: fallback_scalar_nav_1024_uf1_falp + target_compile_options: -fno-builtin;-fno-slp-vectorize;-fno-vectorize diff --git a/publication/results/i4i/gorillas.csv b/publication/results/i4i/gorillas.csv new file mode 100644 index 0000000..f6c88bb --- /dev/null +++ b/publication/results/i4i/gorillas.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode,300000,35.188 +0,air_sensor_f_decode,300000,31.2283 +1,arade4_encode,300000,31.1906 
+1,arade4_decode,300000,33.8147 +2,basel_temp_f_encode,300000,32.4079 +2,basel_temp_f_decode,300000,36.0569 +3,basel_wind_f_encode,300000,42.6261 +3,basel_wind_f_decode,300000,32.8241 +4,bird_migration_f_encode,300000,36.4161 +4,bird_migration_f_decode,300000,30.0803 +5,bitcoin_f_encode,300000,33.1939 +5,bitcoin_f_decode,300000,32.4374 +6,bitcoin_transactions_f_encode,300000,41.4405 +6,bitcoin_transactions_f_decode,300000,32.0752 +7,city_temperature_f_encode,300000,38.2826 +7,city_temperature_f_decode,300000,35.2029 +8,cms1_encode,300000,33.2101 +8,cms1_decode,300000,33.317 +9,cms9_encode,300000,22.4721 +9,cms9_decode,300000,18.8807 +10,cms25_encode,300000,44.7687 +10,cms25_decode,300000,33.8109 +11,food_prices_encode,300000,22.938 +11,food_prices_decode,300000,21.7171 +12,gov10_encode,300000,40.4318 +12,gov10_decode,300000,35.2551 +13,gov26_encode,300000,5.0006 +13,gov26_decode,300000,8.32189 +14,gov30_encode,300000,6.33516 +14,gov30_decode,300000,8.33427 +15,gov31_encode,300000,5.87809 +15,gov31_decode,300000,8.09264 +16,gov40_encode,300000,5.68241 +16,gov40_decode,300000,8.13033 +17,medicare1_encode,300000,31.6809 +17,medicare1_decode,300000,26.8933 +18,medicare9_encode,300000,22.6861 +18,medicare9_decode,300000,19.0597 +19,neon_air_pressure_encode,300000,33.1859 +19,neon_air_pressure_decode,300000,30.59 +20,neon_bio_temp_c_encode,300000,38.3452 +20,neon_bio_temp_c_decode,300000,31.3365 +21,neon_dew_point_temp_encode,300000,36.9523 +21,neon_dew_point_temp_decode,300000,30.9983 +22,neon_pm10_dust_encode,300000,24.5801 +22,neon_pm10_dust_decode,300000,23.7097 +23,neon_wind_dir_encode,300000,39.4908 +23,neon_wind_dir_decode,300000,31.2462 +24,nyc29_encode,300000,18.4144 +24,nyc29_decode,300000,19.2405 +25,poi_lat_encode,300000,38.6296 +25,poi_lat_decode,300000,36.0801 +26,poi_lon_encode,300000,37.9742 +26,poi_lon_decode,300000,36.0724 +27,ssd_hdd_benchmarks_f_encode,300000,32.4685 +27,ssd_hdd_benchmarks_f_decode,300000,30.7621 +28,stocks_de_encode,300000,34.1249 +28,stocks_de_decode,300000,30.9398 +29,stocks_uk_encode,300000,20.8347 +29,stocks_uk_decode,300000,17.4689 +30,stocks_usa_c_encode,300000,22.3902 +30,stocks_usa_c_decode,300000,22.0491 diff --git a/publication/results/i4i/gorillas.metadata b/publication/results/i4i/gorillas.metadata new file mode 100644 index 0000000..2898ae7 --- /dev/null +++ b/publication/results/i4i/gorillas.metadata @@ -0,0 +1,27 @@ +2023-04-11T20:13:40+00:00 +Run on (2 X 2899.97 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.08, 0.02, 0.01 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/patas.csv b/publication/results/i4i/patas.csv new file mode 100644 index 0000000..2b77030 --- /dev/null +++ b/publication/results/i4i/patas.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode,300000,16.9687 +0,air_sensor_f_decode,300000,8.96163 +1,arade4_encode,300000,16.2505 
+1,arade4_decode,300000,13.6057 +2,basel_temp_f_encode,300000,16.3867 +2,basel_temp_f_decode,300000,6.69035 +3,basel_wind_f_encode,300000,16.1748 +3,basel_wind_f_decode,300000,6.17207 +4,bird_migration_f_encode,300000,16.4873 +4,bird_migration_f_decode,300000,5.64632 +5,bitcoin_f_encode,300000,16.2369 +5,bitcoin_f_decode,300000,9.60166 +6,bitcoin_transactions_f_encode,300000,16.1693 +6,bitcoin_transactions_f_decode,300000,6.84054 +7,city_temperature_f_encode,300000,18.2067 +7,city_temperature_f_decode,300000,4.59164 +8,cms1_encode,300000,16.0413 +8,cms1_decode,300000,9.98175 +9,cms9_encode,300000,17.2076 +9,cms9_decode,300000,6.11353 +10,cms25_encode,300000,17.7927 +10,cms25_decode,300000,6.41941 +11,food_prices_encode,300000,16.6694 +11,food_prices_decode,300000,5.7952 +12,gov10_encode,300000,17.0174 +12,gov10_decode,300000,8.73896 +13,gov26_encode,300000,16.0542 +13,gov26_decode,300000,5.74857 +14,gov30_encode,300000,16.3597 +14,gov30_decode,300000,6.15269 +15,gov31_encode,300000,16.1083 +15,gov31_decode,300000,5.80164 +16,gov40_encode,300000,16.2253 +16,gov40_decode,300000,5.7705 +17,medicare1_encode,300000,16.3308 +17,medicare1_decode,300000,6.11785 +18,medicare9_encode,300000,17.6877 +18,medicare9_decode,300000,6.30433 +19,neon_air_pressure_encode,300000,15.9799 +19,neon_air_pressure_decode,300000,5.2089 +20,neon_bio_temp_c_encode,300000,16.6634 +20,neon_bio_temp_c_decode,300000,5.06109 +21,neon_dew_point_temp_encode,300000,16.4384 +21,neon_dew_point_temp_decode,300000,8.04556 +22,neon_pm10_dust_encode,300000,17.4099 +22,neon_pm10_dust_decode,300000,5.64414 +23,neon_wind_dir_encode,300000,16.6258 +23,neon_wind_dir_decode,300000,11.2319 +24,nyc29_encode,300000,16.5384 +24,nyc29_decode,300000,5.67799 +25,poi_lat_encode,300000,16.8956 +25,poi_lat_decode,300000,6.49472 +26,poi_lon_encode,300000,20.2203 +26,poi_lon_decode,300000,6.06563 +27,ssd_hdd_benchmarks_f_encode,300000,16.5886 +27,ssd_hdd_benchmarks_f_decode,300000,6.01322 +28,stocks_de_encode,300000,16.8332 +28,stocks_de_decode,300000,5.73046 +29,stocks_uk_encode,300000,17.3595 +29,stocks_uk_decode,300000,5.88792 +30,stocks_usa_c_encode,300000,17.18 +30,stocks_usa_c_decode,300000,5.91911 diff --git a/publication/results/i4i/patas.metadata b/publication/results/i4i/patas.metadata new file mode 100644 index 0000000..d68b8a1 --- /dev/null +++ b/publication/results/i4i/patas.metadata @@ -0,0 +1,27 @@ +2023-04-01T21:01:16+00:00 +Run on (2 X 3504.1 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.14, 0.31, 0.18 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/ped.csv b/publication/results/i4i/ped.csv new file mode 100644 index 0000000..7cf257f --- /dev/null +++ b/publication/results/i4i/ped.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +8912,air_sensor_f_encode,300000,705.049 +0,air_sensor_f_decode,300000,2.5817 +4409,arade4_encode,300000,1148.9 +1,arade4_decode,300000,2.13847 
+5452,basel_temp_f_encode,300000,1106.64 +2,basel_temp_f_decode,300000,4.0496 +4637,basel_wind_f_encode,300000,888.185 +3,basel_wind_f_decode,300000,3.1627 +3493,bird_migration_f_encode,300000,707.648 +4,bird_migration_f_decode,300000,2.37755 +5512,bitcoin_f_encode,300000,1288.52 +5,bitcoin_f_decode,300000,5.97488 +5241,bitcoin_transactions_f_encode,300000,1210.7 +6,bitcoin_transactions_f_decode,300000,4.74716 +3769,city_temperature_f_encode,300000,829.258 +7,city_temperature_f_decode,300000,3.41249 +7132,cms1_encode,300000,1197.92 +8,cms1_decode,300000,6.86933 +1125,cms9_encode,300000,529.308 +9,cms9_decode,300000,2.20908 +8684,cms25_encode,300000,1057.53 +10,cms25_decode,300000,2.34547 +2917,food_prices_encode,300000,790.364 +11,food_prices_decode,300000,3.2545 +4873,gov10_encode,300000,926.034 +12,gov10_decode,300000,5.23146 +113,gov26_encode,300000,175.191 +13,gov26_decode,300000,1.61329 +1631,gov30_encode,300000,266.209 +14,gov30_decode,300000,2.93666 +269,gov31_encode,300000,177.039 +15,gov31_decode,300000,2.14516 +189,gov40_encode,300000,179.643 +16,gov40_decode,300000,1.61722 +5040,medicare1_encode,300000,890.624 +17,medicare1_decode,300000,5.79265 +1197,medicare9_encode,300000,496.417 +18,medicare9_decode,300000,1.99021 +3953,neon_air_pressure_encode,300000,1070.95 +19,neon_air_pressure_decode,300000,2.51748 +3457,neon_bio_temp_c_encode,300000,822.386 +20,neon_bio_temp_c_decode,300000,2.36109 +3697,neon_dew_point_temp_encode,300000,1080.71 +21,neon_dew_point_temp_decode,300000,2.41566 +2153,neon_pm10_dust_encode,300000,388.765 +22,neon_pm10_dust_decode,300000,2.27119 +4085,neon_wind_dir_encode,300000,1101.6 +23,neon_wind_dir_decode,300000,2.35273 +8912,nyc29_encode,300000,361.441 +24,nyc29_decode,300000,2.16311 +8912,poi_lat_encode,300000,726.743 +25,poi_lat_decode,300000,2.16316 +8912,poi_lon_encode,300000,700.67 +26,poi_lon_decode,300000,2.16802 +4185,ssd_hdd_benchmarks_f_encode,300000,393.582 +27,ssd_hdd_benchmarks_f_decode,300000,2.25994 +3525,stocks_de_encode,300000,640.477 +28,stocks_de_decode,300000,2.35811 +2161,stocks_uk_encode,300000,432.259 +29,stocks_uk_decode,300000,2.14711 +3193,stocks_usa_c_encode,300000,377.442 +30,stocks_usa_c_decode,300000,2.40222 diff --git a/publication/results/i4i/ped.metadata b/publication/results/i4i/ped.metadata new file mode 100644 index 0000000..f306861 --- /dev/null +++ b/publication/results/i4i/ped.metadata @@ -0,0 +1,27 @@ +2023-04-05T16:14:41+00:00 +Run on (2 X 2899.98 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.04, 0.01, 0.11 +cmake info: + source_dir: /home/ubuntu/analyze + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -g + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv b/publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..70f77b5 --- /dev/null +++ b/publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,193 @@ +benchmark_number,name,iterations,cycles_per_tuple 
+0,air_sensor_f_fused,3000000,0.735534 +0,air_sensor_f,3000000,0.949223 +1,arade4_fused,3000000,0.570426 +1,arade4,3000000,0.743116 +2,basel_temp_f_fused,3000000,0.647168 +2,basel_temp_f,3000000,0.862806 +3,basel_wind_f_fused,3000000,0.622699 +3,basel_wind_f,3000000,0.837296 +4,bird_migration_f_fused,3000000,0.587906 +4,bird_migration_f,3000000,0.806356 +5,bitcoin_f_fused,3000000,0.611326 +5,bitcoin_f,3000000,0.827017 +6,bitcoin_transactions_f_fused,3000000,0.620234 +6,bitcoin_transactions_f,3000000,0.836727 +7,city_temperature_f_fused,3000000,0.565403 +7,city_temperature_f,3000000,0.785559 +8,cms1_fused,3000000,0.668048 +8,cms1,3000000,0.879552 +9,cms9_fused,3000000,0.563085 +9,cms9,3000000,0.778188 +10,cms25_fused,3000000,0.645084 +10,cms25,3000000,0.868406 +11,food_prices_fused,3000000,0.628683 +11,food_prices,3000000,0.846963 +12,gov10_fused,3000000,0.664927 +12,gov10,3000000,0.87745 +13,gov26_fused,3000000,0.107984 +13,gov26,3000000,0.737592 +14,gov30_fused,3000000,0.11277 +14,gov30,3000000,0.741204 +15,gov31_fused,3000000,0.108781 +15,gov31,3000000,0.739146 +16,gov40_fused,3000000,0.110798 +16,gov40,3000000,0.740706 +17,medicare1_fused,3000000,0.667511 +17,medicare1,3000000,0.88813 +18,medicare9_fused,3000000,0.563542 +18,medicare9,3000000,0.779709 +19,neon_air_pressure_fused,3000000,0.51365 +19,neon_air_pressure,3000000,0.739682 +20,neon_bio_temp_c_fused,3000000,0.558796 +20,neon_bio_temp_c,3000000,0.780495 +21,neon_dew_point_temp_fused,3000000,0.583338 +21,neon_dew_point_temp,3000000,0.795917 +22,neon_pm10_dust_fused,3000000,0.525261 +22,neon_pm10_dust,3000000,0.738134 +23,neon_wind_dir_fused,3000000,0.509224 +23,neon_wind_dir,3000000,0.744552 +24,nyc29_fused,3000000,0.642502 +24,nyc29,3000000,0.866106 +25,poi_lat_fused,3000000,0.910669 +25,poi_lat,3000000,1.06706 +26,poi_lon_fused,3000000,0.882752 +26,poi_lon,3000000,0.977373 +27,ssd_hdd_benchmarks_f_fused,3000000,0.58463 +27,ssd_hdd_benchmarks_f,3000000,0.803545 +28,stocks_de_fused,3000000,0.56469 +28,stocks_de,3000000,0.778704 +29,stocks_uk_fused,3000000,0.559863 +29,stocks_uk,3000000,0.781873 +30,stocks_usa_c_fused,3000000,0.554783 +30,stocks_usa_c,3000000,0.776821 +0,bw0_fused,3000000,0.107958 +0,bw0,3000000,0.737736 +1,bw1_fused,3000000,0.539962 +1,bw1,3000000,0.75953 +2,bw2_fused,3000000,0.538759 +2,bw2,3000000,0.758219 +3,bw3_fused,3000000,0.543731 +3,bw3,3000000,0.763616 +4,bw4_fused,3000000,0.533322 +4,bw4,3000000,0.753623 +5,bw5_fused,3000000,0.550153 +5,bw5,3000000,0.77055 +6,bw6_fused,3000000,0.548399 +6,bw6,3000000,0.76854 +7,bw7_fused,3000000,0.554803 +7,bw7,3000000,0.775394 +8,bw8_fused,3000000,0.525083 +8,bw8,3000000,0.739804 +9,bw9_fused,3000000,0.560096 +9,bw9,3000000,0.779258 +10,bw10_fused,3000000,0.55883 +10,bw10,3000000,0.77845 +11,bw11_fused,3000000,0.565418 +11,bw11,3000000,0.785551 +12,bw12_fused,3000000,0.553662 +12,bw12,3000000,0.772567 +13,bw13_fused,3000000,0.570965 +13,bw13,3000000,0.790845 +14,bw14_fused,3000000,0.56818 +14,bw14,3000000,0.787241 +15,bw15_fused,3000000,0.576233 +15,bw15,3000000,0.797331 +16,bw16_fused,3000000,0.509329 +16,bw16,3000000,0.737063 +17,bw17_fused,3000000,0.584713 +17,bw17,3000000,0.802172 +18,bw18_fused,3000000,0.578571 +18,bw18,3000000,0.796606 +19,bw19_fused,3000000,0.587173 +19,bw19,3000000,0.805627 +20,bw20_fused,3000000,0.573106 +20,bw20,3000000,0.790228 +21,bw21_fused,3000000,0.59353 +21,bw21,3000000,0.810349 +22,bw22_fused,3000000,0.588067 +22,bw22,3000000,0.807559 +23,bw23_fused,3000000,0.598118 +23,bw23,3000000,0.815921 +24,bw24_fused,3000000,0.564551 
+24,bw24,3000000,0.737803 +25,bw25_fused,3000000,0.603458 +25,bw25,3000000,0.823264 +26,bw26_fused,3000000,0.597355 +26,bw26,3000000,0.818863 +27,bw27_fused,3000000,0.607934 +27,bw27,3000000,0.828896 +28,bw28_fused,3000000,0.592377 +28,bw28,3000000,0.813557 +29,bw29_fused,3000000,0.614177 +29,bw29,3000000,0.834118 +30,bw30_fused,3000000,0.607425 +30,bw30,3000000,0.82949 +31,bw31_fused,3000000,0.619439 +31,bw31,3000000,0.837564 +32,bw32_fused,3000000,0.473831 +32,bw32,3000000,0.738802 +33,bw33_fused,3000000,0.624525 +33,bw33,3000000,0.844967 +34,bw34_fused,3000000,0.618046 +34,bw34,3000000,0.839185 +35,bw35_fused,3000000,0.629518 +35,bw35,3000000,0.849643 +36,bw36_fused,3000000,0.611299 +36,bw36,3000000,0.830402 +37,bw37_fused,3000000,0.640581 +37,bw37,3000000,0.854821 +38,bw38_fused,3000000,0.628652 +38,bw38,3000000,0.848448 +39,bw39_fused,3000000,0.652499 +39,bw39,3000000,0.867998 +40,bw40_fused,3000000,0.601676 +40,bw40,3000000,0.758284 +41,bw41_fused,3000000,0.650563 +41,bw41,3000000,0.866763 +42,bw42_fused,3000000,0.638033 +42,bw42,3000000,0.859149 +43,bw43_fused,3000000,0.656157 +43,bw43,3000000,0.880554 +44,bw44_fused,3000000,0.630222 +44,bw44,3000000,0.852117 +45,bw45_fused,3000000,0.656371 +45,bw45,3000000,0.877043 +46,bw46_fused,3000000,0.647234 +46,bw46,3000000,0.869758 +47,bw47_fused,3000000,0.663678 +47,bw47,3000000,0.885626 +48,bw48_fused,3000000,0.587442 +48,bw48,3000000,0.759221 +49,bw49_fused,3000000,0.669696 +49,bw49,3000000,0.891219 +50,bw50_fused,3000000,0.656709 +50,bw50,3000000,0.876606 +51,bw51_fused,3000000,0.692988 +51,bw51,3000000,0.900645 +52,bw52_fused,3000000,0.649145 +52,bw52,3000000,0.872187 +53,bw53_fused,3000000,0.706483 +53,bw53,3000000,0.914971 +54,bw54_fused,3000000,0.686286 +54,bw54,3000000,0.890756 +55,bw55_fused,3000000,0.717181 +55,bw55,3000000,0.921688 +56,bw56_fused,3000000,0.640511 +56,bw56,3000000,0.793344 +57,bw57_fused,3000000,0.715165 +57,bw57,3000000,0.919173 +58,bw58_fused,3000000,0.6969 +58,bw58,3000000,0.923633 +59,bw59_fused,3000000,0.700572 +59,bw59,3000000,0.925202 +60,bw60_fused,3000000,0.668111 +60,bw60,3000000,0.891923 +61,bw61_fused,3000000,0.717039 +61,bw61,3000000,0.930145 +62,bw62_fused,3000000,0.697684 +62,bw62,3000000,0.924337 +63,bw63_fused,3000000,1.13278 +63,bw63,3000000,1.62142 +64,bw64_fused,3000000,1.1319 +64,bw64,3000000,1.62149 diff --git a/publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata b/publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..82ac5fb --- /dev/null +++ b/publication/results/i4i/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-04-08T16:54:25+00:00 +Run on (2 X 3499.48 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x1) + L1 Instruction 32 KiB (x1) + L2 Unified 1280 KiB (x1) + L3 Unified 55296 KiB (x1) +Load Average: 0.00, 0.01, 0.08 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i +target info: + target_name: x86_64_avx512bw_intrinsic_1024_uf1_falp + target_compile_options: -fPIC;-mavx512dq diff --git a/publication/results/i4i_4xlarge/README.md 
b/publication/results/i4i_4xlarge/README.md new file mode 100644 index 0000000..6be420f --- /dev/null +++ b/publication/results/i4i_4xlarge/README.md @@ -0,0 +1,8 @@ +# Info +[i4i](https://aws.amazon.com/ec2/instance-types/i4i/): +- I4i instances are powered by the latest generation Intel Xeon Scalable (Ice Lake) Processors with an all-core turbo frequency of 3.5 GHz. + + +--- +## History +1. [link](https://www.cpubenchmark.net/cpu.php?cpu=Intel+Xeon+Platinum+8375C+%40+2.90GHz&id=4486) \ No newline at end of file diff --git a/publication/results/i4i_4xlarge/alp_decode_cutter.csv b/publication/results/i4i_4xlarge/alp_decode_cutter.csv new file mode 100644 index 0000000..301aaca --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_decode_cutter.csv @@ -0,0 +1,4 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode,300000,1.68266 +25,poi_lat_encode,300000,1.74618 +26,poi_lon_encode,300000,1.70154 diff --git a/publication/results/i4i_4xlarge/alp_decode_cutter.metadata b/publication/results/i4i_4xlarge/alp_decode_cutter.metadata new file mode 100644 index 0000000..f6c75ae --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_decode_cutter.metadata @@ -0,0 +1,27 @@ +2023-04-11T22:38:08+00:00 +Run on (16 X 2900 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x8) + L1 Instruction 32 KiB (x8) + L2 Unified 1280 KiB (x8) + L3 Unified 55296 KiB (x1) +Load Average: 0.14, 0.13, 0.05 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i_4xlarge +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i_4xlarge/alp_encode.csv b/publication/results/i4i_4xlarge/alp_encode.csv new file mode 100644 index 0000000..1ec60e7 --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_encode.csv @@ -0,0 +1,29 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_encode_simdized,300000,2.38023 +2,basel_temp_f_encode_simdized,300000,2.53778 +3,basel_wind_f_encode_simdized,300000,2.51468 +4,bird_migration_f_encode_simdized,300000,2.40356 +5,bitcoin_f_encode_simdized,300000,2.47314 +6,bitcoin_transactions_f_encode_simdized,300000,2.43465 +7,city_temperature_f_encode_simdized,300000,2.378 +8,cms1_encode_simdized,300000,2.53815 +9,cms9_encode_simdized,300000,2.33577 +10,cms25_encode_simdized,300000,2.44143 +11,food_prices_encode_simdized,300000,2.51977 +12,gov10_encode_simdized,300000,2.64408 +13,gov26_encode_simdized,300000,1.91788 +14,gov30_encode_simdized,300000,1.92753 +15,gov31_encode_simdized,300000,1.91077 +16,gov40_encode_simdized,300000,1.91398 +17,medicare1_encode_simdized,300000,2.56369 +18,medicare9_encode_simdized,300000,2.34318 +19,neon_air_pressure_encode_simdized,300000,2.32268 +20,neon_bio_temp_c_encode_simdized,300000,2.33313 +21,neon_dew_point_temp_encode_simdized,300000,2.40116 +22,neon_pm10_dust_encode_simdized,300000,2.24041 +23,neon_wind_dir_encode_simdized,300000,2.29825 +24,nyc29_encode_simdized,300000,2.43021 +27,ssd_hdd_benchmarks_f_encode_simdized,300000,2.40469 +28,stocks_de_encode_simdized,300000,2.33342 +29,stocks_uk_encode_simdized,300000,2.37033 
+30,stocks_usa_c_encode_simdized,300000,2.37109 diff --git a/publication/results/i4i_4xlarge/alp_encode.metadata b/publication/results/i4i_4xlarge/alp_encode.metadata new file mode 100644 index 0000000..5c5cf2c --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_encode.metadata @@ -0,0 +1,27 @@ +2023-04-11T22:48:45+00:00 +Run on (16 X 2900 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x8) + L1 Instruction 32 KiB (x8) + L2 Unified 1280 KiB (x8) + L3 Unified 55296 KiB (x1) +Load Average: 0.17, 0.08, 0.03 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i_4xlarge +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i_4xlarge/alp_encode_pde.csv b/publication/results/i4i_4xlarge/alp_encode_pde.csv new file mode 100644 index 0000000..0f6ef9a --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_encode_pde.csv @@ -0,0 +1,29 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_encode,300000,2.59007 +2,basel_temp_f_encode,300000,2.74668 +3,basel_wind_f_encode,300000,2.70031 +4,bird_migration_f_encode,300000,2.60336 +5,bitcoin_f_encode,300000,2.67054 +6,bitcoin_transactions_f_encode,300000,2.62551 +7,city_temperature_f_encode,300000,2.53987 +8,cms1_encode,300000,2.7336 +9,cms9_encode,300000,2.53942 +10,cms25_encode,300000,2.64591 +11,food_prices_encode,300000,2.71783 +12,gov10_encode,300000,2.94825 +13,gov26_encode,300000,2.07008 +14,gov30_encode,300000,2.13138 +15,gov31_encode,300000,2.11546 +16,gov40_encode,300000,2.12282 +17,medicare1_encode,300000,2.79457 +18,medicare9_encode,300000,2.53497 +19,neon_air_pressure_encode,300000,2.48776 +20,neon_bio_temp_c_encode,300000,2.47906 +21,neon_dew_point_temp_encode,300000,2.60257 +22,neon_pm10_dust_encode,300000,2.38602 +23,neon_wind_dir_encode,300000,2.42716 +24,nyc29_encode,300000,2.62916 +27,ssd_hdd_benchmarks_f_encode,300000,2.55324 +28,stocks_de_encode,300000,2.53258 +29,stocks_uk_encode,300000,2.5257 +30,stocks_usa_c_encode,300000,2.52238 diff --git a/publication/results/i4i_4xlarge/alp_encode_pde.metadata b/publication/results/i4i_4xlarge/alp_encode_pde.metadata new file mode 100644 index 0000000..122f964 --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_encode_pde.metadata @@ -0,0 +1,27 @@ +2023-04-11T22:35:58+00:00 +Run on (16 X 2900 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x8) + L1 Instruction 32 KiB (x8) + L2 Unified 1280 KiB (x8) + L3 Unified 55296 KiB (x1) +Load Average: 0.43, 0.15, 0.04 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i_4xlarge +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i_4xlarge/alp_encode_without_sampling.csv 
b/publication/results/i4i_4xlarge/alp_encode_without_sampling.csv new file mode 100644 index 0000000..d5b9352 --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_encode_without_sampling.csv @@ -0,0 +1,29 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_encode_simdized,300000,2.36373 +2,basel_temp_f_encode_simdized,300000,2.52222 +3,basel_wind_f_encode_simdized,300000,2.48037 +4,bird_migration_f_encode_simdized,300000,2.38272 +5,bitcoin_f_encode_simdized,300000,2.45961 +6,bitcoin_transactions_f_encode_simdized,300000,2.42366 +7,city_temperature_f_encode_simdized,300000,2.37147 +8,cms1_encode_simdized,300000,2.53175 +9,cms9_encode_simdized,300000,2.31282 +10,cms25_encode_simdized,300000,2.4341 +11,food_prices_encode_simdized,300000,2.50929 +12,gov10_encode_simdized,300000,2.61827 +13,gov26_encode_simdized,300000,1.89847 +14,gov30_encode_simdized,300000,1.90992 +15,gov31_encode_simdized,300000,1.8904 +16,gov40_encode_simdized,300000,1.89314 +17,medicare1_encode_simdized,300000,2.5701 +18,medicare9_encode_simdized,300000,2.33099 +19,neon_air_pressure_encode_simdized,300000,2.29841 +20,neon_bio_temp_c_encode_simdized,300000,2.3151 +21,neon_dew_point_temp_encode_simdized,300000,2.38419 +22,neon_pm10_dust_encode_simdized,300000,2.22077 +23,neon_wind_dir_encode_simdized,300000,2.2714 +24,nyc29_encode_simdized,300000,2.41509 +27,ssd_hdd_benchmarks_f_encode_simdized,300000,2.38424 +28,stocks_de_encode_simdized,300000,2.31495 +29,stocks_uk_encode_simdized,300000,2.35838 +30,stocks_usa_c_encode_simdized,300000,2.35842 diff --git a/publication/results/i4i_4xlarge/alp_encode_without_sampling.metadata b/publication/results/i4i_4xlarge/alp_encode_without_sampling.metadata new file mode 100644 index 0000000..23af079 --- /dev/null +++ b/publication/results/i4i_4xlarge/alp_encode_without_sampling.metadata @@ -0,0 +1,27 @@ +2023-04-11T22:49:32+00:00 +Run on (16 X 2900 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x8) + L1 Instruction 32 KiB (x8) + L2 Unified 1280 KiB (x8) + L3 Unified 55296 KiB (x1) +Load Average: 0.15, 0.09, 0.03 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: i4i_4xlarge +target info: + target_name: + target_compile_options: diff --git a/publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv b/publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..b6499f8 --- /dev/null +++ b/publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,0.793257 +0,air_sensor_f,3000000,1.33114 +1,arade4_fused,3000000,0.569469 +1,arade4,3000000,0.978004 +2,basel_temp_f_fused,3000000,0.647115 +2,basel_temp_f,3000000,1.10038 +3,basel_wind_f_fused,3000000,0.635952 +3,basel_wind_f,3000000,1.09618 +4,bird_migration_f_fused,3000000,0.585521 +4,bird_migration_f,3000000,1.02182 +5,bitcoin_f_fused,3000000,0.612594 +5,bitcoin_f,3000000,1.07196 +6,bitcoin_transactions_f_fused,3000000,0.616801 +6,bitcoin_transactions_f,3000000,1.08706 +7,city_temperature_f_fused,3000000,0.565435 
+7,city_temperature_f,3000000,0.989905 +8,cms1_fused,3000000,0.666972 +8,cms1,3000000,1.17086 +9,cms9_fused,3000000,0.563108 +9,cms9,3000000,0.97639 +10,cms25_fused,3000000,0.640577 +10,cms25,3000000,1.15707 +11,food_prices_fused,3000000,0.624783 +11,food_prices,3000000,1.08129 +12,gov10_fused,3000000,0.676186 +12,gov10,3000000,1.13762 +13,gov26_fused,3000000,0.108055 +13,gov26,3000000,0.833727 +14,gov30_fused,3000000,0.111435 +14,gov30,3000000,0.836312 +15,gov31_fused,3000000,0.108908 +15,gov31,3000000,0.836335 +16,gov40_fused,3000000,0.109792 +16,gov40,3000000,0.838302 +17,medicare1_fused,3000000,0.665111 +17,medicare1,3000000,1.17772 +18,medicare9_fused,3000000,0.567326 +18,medicare9,3000000,0.976106 +19,neon_air_pressure_fused,3000000,0.513865 +19,neon_air_pressure,3000000,0.867196 +20,neon_bio_temp_c_fused,3000000,0.558718 +20,neon_bio_temp_c,3000000,0.97441 +21,neon_dew_point_temp_fused,3000000,0.579757 +21,neon_dew_point_temp,3000000,0.999814 +22,neon_pm10_dust_fused,3000000,0.525327 +22,neon_pm10_dust,3000000,0.883437 +23,neon_wind_dir_fused,3000000,0.509356 +23,neon_wind_dir,3000000,0.86718 +24,nyc29_fused,3000000,0.640688 +24,nyc29,3000000,1.15975 +25,poi_lat_fused,3000000,0.930615 +25,poi_lat,3000000,1.46123 +26,poi_lon_fused,3000000,0.882995 +26,poi_lon,3000000,1.40789 +27,ssd_hdd_benchmarks_f_fused,3000000,0.581742 +27,ssd_hdd_benchmarks_f,3000000,1.02073 +28,stocks_de_fused,3000000,0.563002 +28,stocks_de,3000000,0.976846 +29,stocks_uk_fused,3000000,0.559865 +29,stocks_uk,3000000,0.977135 +30,stocks_usa_c_fused,3000000,0.554831 +30,stocks_usa_c,3000000,0.970758 diff --git a/publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata b/publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..bd5eb76 --- /dev/null +++ b/publication/results/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T15:57:18+00:00 +Run on (16 X 2899.98 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x8) + L1 Instruction 32 KiB (x8) + L2 Unified 1280 KiB (x8) + L3 Unified 55296 KiB (x1) +Load Average: 0.31, 0.08, 0.02 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: i4i_4xlarge +target info: + target_name: x86_64_avx512bw_intrinsic_1024_uf1_falp + target_compile_options: -mavx512dq diff --git a/publication/results/m1/README.md b/publication/results/m1/README.md new file mode 100644 index 0000000..a7e9fc8 --- /dev/null +++ b/publication/results/m1/README.md @@ -0,0 +1,5 @@ +# Info + + +--- +## History diff --git a/publication/results/m1/alp_encode.csv b/publication/results/m1/alp_encode.csv new file mode 100644 index 0000000..c549e47 --- /dev/null +++ b/publication/results/m1/alp_encode.csv @@ -0,0 +1,32 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_encode_simdized,300000,8.84598 +1,arade4_encode_simdized,300000,8.37173 +2,basel_temp_f_encode_simdized,300000,8.63165 +3,basel_wind_f_encode_simdized,300000,8.73768 +4,bird_migration_f_encode_simdized,300000,8.51327 
+5,bitcoin_f_encode_simdized,300000,8.60084 +6,bitcoin_transactions_f_encode_simdized,300000,8.54 +7,city_temperature_f_encode_simdized,300000,8.4418 +8,cms1_encode_simdized,300000,8.56009 +9,cms9_encode_simdized,300000,8.3335 +10,cms25_encode_simdized,300000,8.5332 +11,food_prices_encode_simdized,300000,8.62882 +12,gov10_encode_simdized,300000,8.86406 +13,gov26_encode_simdized,300000,7.22254 +14,gov30_encode_simdized,300000,7.17026 +15,gov31_encode_simdized,300000,7.15777 +16,gov40_encode_simdized,300000,7.16952 +17,medicare1_encode_simdized,300000,8.70978 +18,medicare9_encode_simdized,300000,8.26711 +19,neon_air_pressure_encode_simdized,300000,8.10415 +20,neon_bio_temp_c_encode_simdized,300000,8.32454 +21,neon_dew_point_temp_encode_simdized,300000,8.42611 +22,neon_pm10_dust_encode_simdized,300000,8.21114 +23,neon_wind_dir_encode_simdized,300000,8.16675 +24,nyc29_encode_simdized,300000,8.5225 +25,poi_lat_encode_simdized,300000,9.41257 +26,poi_lon_encode_simdized,300000,9.25525 +27,ssd_hdd_benchmarks_f_encode_simdized,300000,8.53097 +28,stocks_de_encode_simdized,300000,8.28986 +29,stocks_uk_encode_simdized,300000,8.52196 +30,stocks_usa_c_encode_simdized,300000,8.39713 diff --git a/publication/results/m1/alp_encode.metadata b/publication/results/m1/alp_encode.metadata new file mode 100644 index 0000000..7603108 --- /dev/null +++ b/publication/results/m1/alp_encode.metadata @@ -0,0 +1,26 @@ +2023-04-08T21:30:09+02:00 +Run on (8 X 0.005724 MHz CPU s) +CPU Caches: + L1 Data 64 KiB (x8) + L1 Instruction 128 KiB (x8) + L2 Unified 4096 KiB (x2) +Load Average: 2.63, 5.07, 6.45 +cmake info: + source_dir: /Users/azim/CLionProjects/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: arm64 + cmake_system_processor: arm64 + cmake_host_system_name: Darwin + cmake_system_name: Linux + cmake_c_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang + cmake_cxx_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 15.0.7 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: m1 +target info: + target_name: + target_compile_options: diff --git a/publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.csv b/publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..f8bd2e2 --- /dev/null +++ b/publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,57 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_fused,3000000,3.10851 +1,arade4,3000000,3.11517 +2,basel_temp_f_fused,3000000,3.12028 +2,basel_temp_f,3000000,3.26982 +3,basel_wind_f_fused,3000000,3.09247 +3,basel_wind_f,3000000,3.28476 +4,bird_migration_f_fused,3000000,3.10925 +4,bird_migration_f,3000000,3.19056 +5,bitcoin_f_fused,3000000,3.12429 +5,bitcoin_f,3000000,3.24996 +6,bitcoin_transactions_f_fused,3000000,3.10706 +6,bitcoin_transactions_f,3000000,3.25144 +7,city_temperature_f_fused,3000000,3.02706 +7,city_temperature_f,3000000,3.14816 +8,cms1_fused,3000000,3.1807 +8,cms1,3000000,3.40785 +9,cms9_fused,3000000,3.04735 +9,cms9,3000000,3.12608 +10,cms25_fused,3000000,3.11245 +10,cms25,3000000,3.31658 +11,food_prices_fused,3000000,3.08201 +11,food_prices,3000000,3.26517 +12,gov10_fused,3000000,3.14321 +12,gov10,3000000,3.32754 +13,gov26_fused,3000000,0.500149 +13,gov26,3000000,2.82894 +14,gov30_fused,3000000,0.506657 +14,gov30,3000000,2.82809 +15,gov31_fused,3000000,0.501133 +15,gov31,3000000,2.82253 
+16,gov40_fused,3000000,0.504715 +16,gov40,3000000,2.82864 +17,medicare1_fused,3000000,3.15906 +17,medicare1,3000000,3.33292 +18,medicare9_fused,3000000,3.05402 +18,medicare9,3000000,3.12657 +19,neon_air_pressure_fused,3000000,3.33872 +19,neon_air_pressure,3000000,2.93629 +20,neon_bio_temp_c_fused,3000000,3.05781 +20,neon_bio_temp_c,3000000,3.14488 +21,neon_dew_point_temp_fused,3000000,3.06939 +21,neon_dew_point_temp,3000000,3.17439 +22,neon_pm10_dust_fused,3000000,3.20733 +22,neon_pm10_dust,3000000,3.02721 +23,neon_wind_dir_fused,3000000,3.31208 +23,neon_wind_dir,3000000,2.93779 +24,nyc29_fused,3000000,3.11899 +24,nyc29,3000000,3.31333 +27,ssd_hdd_benchmarks_f_fused,3000000,3.09582 +27,ssd_hdd_benchmarks_f,3000000,3.18826 +28,stocks_de_fused,3000000,3.03518 +28,stocks_de,3000000,3.12915 +29,stocks_uk_fused,3000000,3.0811 +29,stocks_uk,3000000,3.14248 +30,stocks_usa_c_fused,3000000,3.14353 +30,stocks_usa_c,3000000,3.14589 diff --git a/publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.metadata b/publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..d89f843 --- /dev/null +++ b/publication/results/m1/arm64v8_neon_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,26 @@ +2023-04-12T11:36:07+02:00 +Run on (8 X 0.007993 MHz CPU s) +CPU Caches: + L1 Data 64 KiB (x8) + L1 Instruction 128 KiB (x8) + L2 Unified 4096 KiB (x2) +Load Average: 3.44, 3.60, 3.57 +cmake info: + source_dir: /Users/azim/CLionProjects/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: arm64 + cmake_system_processor: arm64 + cmake_host_system_name: Darwin + cmake_system_name: Linux + cmake_c_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang + cmake_cxx_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 15.0.7 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: m1 +target info: + target_name: arm64v8_neon_intrinsic_1024_uf1_falp + target_compile_options: -fPIC;-O3 diff --git a/publication/results/m1/fallback_scalar_aav_1024_uf1_falp.csv b/publication/results/m1/fallback_scalar_aav_1024_uf1_falp.csv new file mode 100644 index 0000000..7cb18ea --- /dev/null +++ b/publication/results/m1/fallback_scalar_aav_1024_uf1_falp.csv @@ -0,0 +1,57 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_fused,3000000,3.01443 +1,arade4,3000000,3.11251 +2,basel_temp_f_fused,3000000,3.00408 +2,basel_temp_f,3000000,3.27159 +3,basel_wind_f_fused,3000000,3.03308 +3,basel_wind_f,3000000,3.27592 +4,bird_migration_f_fused,3000000,2.96291 +4,bird_migration_f,3000000,3.1926 +5,bitcoin_f_fused,3000000,2.95112 +5,bitcoin_f,3000000,3.25334 +6,bitcoin_transactions_f_fused,3000000,3.21561 +6,bitcoin_transactions_f,3000000,3.2532 +7,city_temperature_f_fused,3000000,2.98688 +7,city_temperature_f,3000000,3.15162 +8,cms1_fused,3000000,2.85887 +8,cms1,3000000,3.38488 +9,cms9_fused,3000000,2.88033 +9,cms9,3000000,3.13448 +10,cms25_fused,3000000,3.03605 +10,cms25,3000000,3.31435 +11,food_prices_fused,3000000,3.02071 +11,food_prices,3000000,3.26588 +12,gov10_fused,3000000,3.07522 +12,gov10,3000000,3.34085 +13,gov26_fused,3000000,0.500418 +13,gov26,3000000,2.82071 +14,gov30_fused,3000000,0.509299 +14,gov30,3000000,2.82985 +15,gov31_fused,3000000,0.503309 +15,gov31,3000000,2.82858 +16,gov40_fused,3000000,0.50695 +16,gov40,3000000,2.82914 +17,medicare1_fused,3000000,3.13742 +17,medicare1,3000000,3.33423 
+18,medicare9_fused,3000000,2.89201 +18,medicare9,3000000,3.12802 +19,neon_air_pressure_fused,3000000,2.87954 +19,neon_air_pressure,3000000,2.94052 +20,neon_bio_temp_c_fused,3000000,2.84923 +20,neon_bio_temp_c,3000000,3.12948 +21,neon_dew_point_temp_fused,3000000,2.97439 +21,neon_dew_point_temp,3000000,3.17315 +22,neon_pm10_dust_fused,3000000,2.97438 +22,neon_pm10_dust,3000000,3.02776 +23,neon_wind_dir_fused,3000000,2.87482 +23,neon_wind_dir,3000000,2.93649 +24,nyc29_fused,3000000,3.03591 +24,nyc29,3000000,3.31745 +27,ssd_hdd_benchmarks_f_fused,3000000,2.9105 +27,ssd_hdd_benchmarks_f,3000000,3.18804 +28,stocks_de_fused,3000000,2.89496 +28,stocks_de,3000000,3.13157 +29,stocks_uk_fused,3000000,2.95276 +29,stocks_uk,3000000,3.13991 +30,stocks_usa_c_fused,3000000,2.98749 +30,stocks_usa_c,3000000,3.14015 diff --git a/publication/results/m1/fallback_scalar_aav_1024_uf1_falp.metadata b/publication/results/m1/fallback_scalar_aav_1024_uf1_falp.metadata new file mode 100644 index 0000000..b82da31 --- /dev/null +++ b/publication/results/m1/fallback_scalar_aav_1024_uf1_falp.metadata @@ -0,0 +1,26 @@ +2023-04-12T12:08:43+02:00 +Run on (8 X 0.008753 MHz CPU s) +CPU Caches: + L1 Data 64 KiB (x8) + L1 Instruction 128 KiB (x8) + L2 Unified 4096 KiB (x2) +Load Average: 2.80, 3.76, 3.29 +cmake info: + source_dir: /Users/azim/CLionProjects/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: arm64 + cmake_system_processor: arm64 + cmake_host_system_name: Darwin + cmake_system_name: Linux + cmake_c_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang + cmake_cxx_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 15.0.7 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: m1 +target info: + target_name: fallback_scalar_aav_1024_uf1_falp + target_compile_options: -fPIC;-O3 diff --git a/publication/results/m1/fallback_scalar_nav_1024_uf1_falp.csv b/publication/results/m1/fallback_scalar_nav_1024_uf1_falp.csv new file mode 100644 index 0000000..3963b41 --- /dev/null +++ b/publication/results/m1/fallback_scalar_nav_1024_uf1_falp.csv @@ -0,0 +1,57 @@ +benchmark_number,name,iterations,cycles_per_tuple +1,arade4_fused,3000000,2.22901 +1,arade4,3000000,3.06908 +2,basel_temp_f_fused,3000000,2.38581 +2,basel_temp_f,3000000,3.2152 +3,basel_wind_f_fused,3000000,2.3379 +3,basel_wind_f,3000000,3.22917 +4,bird_migration_f_fused,3000000,2.12804 +4,bird_migration_f,3000000,3.13741 +5,bitcoin_f_fused,3000000,2.28888 +5,bitcoin_f,3000000,3.20401 +6,bitcoin_transactions_f_fused,3000000,2.35188 +6,bitcoin_transactions_f,3000000,3.20649 +7,city_temperature_f_fused,3000000,2.12301 +7,city_temperature_f,3000000,3.09895 +8,cms1_fused,3000000,2.36371 +8,cms1,3000000,3.329 +9,cms9_fused,3000000,2.21759 +9,cms9,3000000,3.07776 +10,cms25_fused,3000000,2.359 +10,cms25,3000000,3.26137 +11,food_prices_fused,3000000,2.28153 +11,food_prices,3000000,3.21173 +12,gov10_fused,3000000,2.39551 +12,gov10,3000000,3.27395 +13,gov26_fused,3000000,1.00044 +13,gov26,3000000,2.77187 +14,gov30_fused,3000000,1.00805 +14,gov30,3000000,2.78176 +15,gov31_fused,3000000,1.00175 +15,gov31,3000000,2.77479 +16,gov40_fused,3000000,1.00589 +16,gov40,3000000,2.77945 +17,medicare1_fused,3000000,2.40907 +17,medicare1,3000000,3.2788 +18,medicare9_fused,3000000,2.21626 +18,medicare9,3000000,3.10709 +19,neon_air_pressure_fused,3000000,2.11277 +19,neon_air_pressure,3000000,2.88974 
+20,neon_bio_temp_c_fused,3000000,2.2172 +20,neon_bio_temp_c,3000000,3.07627 +21,neon_dew_point_temp_fused,3000000,2.09038 +21,neon_dew_point_temp,3000000,3.11791 +22,neon_pm10_dust_fused,3000000,2.13456 +22,neon_pm10_dust,3000000,2.96838 +23,neon_wind_dir_fused,3000000,2.10436 +23,neon_wind_dir,3000000,2.8836 +24,nyc29_fused,3000000,2.36398 +24,nyc29,3000000,3.2586 +27,ssd_hdd_benchmarks_f_fused,3000000,2.11657 +27,ssd_hdd_benchmarks_f,3000000,3.13233 +28,stocks_de_fused,3000000,2.22452 +28,stocks_de,3000000,3.08412 +29,stocks_uk_fused,3000000,2.16014 +29,stocks_uk,3000000,3.09142 +30,stocks_usa_c_fused,3000000,2.00158 +30,stocks_usa_c,3000000,3.07478 diff --git a/publication/results/m1/fallback_scalar_nav_1024_uf1_falp.metadata b/publication/results/m1/fallback_scalar_nav_1024_uf1_falp.metadata new file mode 100644 index 0000000..e1defc7 --- /dev/null +++ b/publication/results/m1/fallback_scalar_nav_1024_uf1_falp.metadata @@ -0,0 +1,26 @@ +2023-04-12T11:07:16+02:00 +Run on (8 X 0.017268 MHz CPU s) +CPU Caches: + L1 Data 64 KiB (x8) + L1 Instruction 128 KiB (x8) + L2 Unified 4096 KiB (x2) +Load Average: 5.85, 3.85, 2.71 +cmake info: + source_dir: /Users/azim/CLionProjects/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: arm64 + cmake_system_processor: arm64 + cmake_host_system_name: Darwin + cmake_system_name: Linux + cmake_c_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang + cmake_cxx_compiler: /opt/homebrew/Cellar/llvm/15.0.7_1/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 15.0.7 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: m1 +target info: + target_name: fallback_scalar_nav_1024_uf1_falp + target_compile_options: -fPIC;-O3;-fno-slp-vectorize;-fno-vectorize diff --git a/publication/results/m6a_xlarge/README.md b/publication/results/m6a_xlarge/README.md new file mode 100644 index 0000000..a7e9fc8 --- /dev/null +++ b/publication/results/m6a_xlarge/README.md @@ -0,0 +1,5 @@ +# Info + + +--- +## History diff --git a/publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.csv b/publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.csv new file mode 100644 index 0000000..971d662 --- /dev/null +++ b/publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,2.90132 +0,air_sensor_f,3000000,1.77651 +1,arade4_fused,3000000,2.66948 +1,arade4,3000000,1.59806 +2,basel_temp_f_fused,3000000,2.77956 +2,basel_temp_f,3000000,1.66923 +3,basel_wind_f_fused,3000000,2.7787 +3,basel_wind_f,3000000,1.6623 +4,bird_migration_f_fused,3000000,2.7054 +4,bird_migration_f,3000000,1.61496 +5,bitcoin_f_fused,3000000,2.76026 +5,bitcoin_f,3000000,1.64704 +6,bitcoin_transactions_f_fused,3000000,2.78705 +6,bitcoin_transactions_f,3000000,1.66038 +7,city_temperature_f_fused,3000000,2.67641 +7,city_temperature_f,3000000,1.5978 +8,cms1_fused,3000000,2.83722 +8,cms1,3000000,1.70743 +9,cms9_fused,3000000,2.65707 +9,cms9,3000000,1.59492 +10,cms25_fused,3000000,2.83201 +10,cms25,3000000,1.69637 +11,food_prices_fused,3000000,2.75846 +11,food_prices,3000000,1.65201 +12,gov10_fused,3000000,2.91454 +12,gov10,3000000,1.80153 +13,gov26_fused,3000000,0.371504 +13,gov26,3000000,1.58019 +14,gov30_fused,3000000,0.374415 +14,gov30,3000000,1.58741 +15,gov31_fused,3000000,0.371728 +15,gov31,3000000,1.58429 +16,gov40_fused,3000000,0.372611 +16,gov40,3000000,1.58384 
+17,medicare1_fused,3000000,2.84224 +17,medicare1,3000000,1.70676 +18,medicare9_fused,3000000,2.65631 +18,medicare9,3000000,1.5915 +19,neon_air_pressure_fused,3000000,2.54344 +19,neon_air_pressure,3000000,1.58334 +20,neon_bio_temp_c_fused,3000000,2.65533 +20,neon_bio_temp_c,3000000,1.58859 +21,neon_dew_point_temp_fused,3000000,2.67996 +21,neon_dew_point_temp,3000000,1.60656 +22,neon_pm10_dust_fused,3000000,2.56294 +22,neon_pm10_dust,3000000,1.58307 +23,neon_wind_dir_fused,3000000,2.54271 +23,neon_wind_dir,3000000,1.58038 +24,nyc29_fused,3000000,2.83194 +24,nyc29,3000000,1.69427 +25,poi_lat_fused,3000000,3.01528 +25,poi_lat,3000000,1.91255 +26,poi_lon_fused,3000000,3.01264 +26,poi_lon,3000000,1.90676 +27,ssd_hdd_benchmarks_f_fused,3000000,2.70166 +27,ssd_hdd_benchmarks_f,3000000,1.61143 +28,stocks_de_fused,3000000,2.66292 +28,stocks_de,3000000,1.59458 +29,stocks_uk_fused,3000000,2.65458 +29,stocks_uk,3000000,1.58932 +30,stocks_usa_c_fused,3000000,2.63371 +30,stocks_usa_c,3000000,1.58726 diff --git a/publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.metadata b/publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.metadata new file mode 100644 index 0000000..ae92b1a --- /dev/null +++ b/publication/results/m6a_xlarge/fallback_scalar_aav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-04-01T14:03:34+00:00 +Run on (4 X 3541.97 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x2) + L1 Instruction 32 KiB (x2) + L2 Unified 512 KiB (x2) + L3 Unified 8192 KiB (x1) +Load Average: 0.47, 0.38, 0.20 +cmake info: + source_dir: /home/ubuntu/bench_alp + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: m6a_xlarge +target info: + target_name: fallback_scalar_aav_1024_uf1_falp + target_compile_options: -O3 diff --git a/publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.csv b/publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.csv new file mode 100644 index 0000000..75f89d5 --- /dev/null +++ b/publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,2.89534 +0,air_sensor_f,3000000,1.77287 +1,arade4_fused,3000000,2.66337 +1,arade4,3000000,1.59856 +2,basel_temp_f_fused,3000000,2.77053 +2,basel_temp_f,3000000,1.66851 +3,basel_wind_f_fused,3000000,2.77121 +3,basel_wind_f,3000000,1.66433 +4,bird_migration_f_fused,3000000,2.69844 +4,bird_migration_f,3000000,1.61781 +5,bitcoin_f_fused,3000000,2.75495 +5,bitcoin_f,3000000,1.64731 +6,bitcoin_transactions_f_fused,3000000,2.80703 +6,bitcoin_transactions_f,3000000,1.6641 +7,city_temperature_f_fused,3000000,2.66998 +7,city_temperature_f,3000000,1.59303 +8,cms1_fused,3000000,2.83312 +8,cms1,3000000,1.70266 +9,cms9_fused,3000000,2.65092 +9,cms9,3000000,1.60329 +10,cms25_fused,3000000,2.83054 +10,cms25,3000000,1.69795 +11,food_prices_fused,3000000,2.76334 +11,food_prices,3000000,1.65703 +12,gov10_fused,3000000,2.80781 +12,gov10,3000000,1.69864 +13,gov26_fused,3000000,0.371308 +13,gov26,3000000,1.58728 +14,gov30_fused,3000000,0.374497 +14,gov30,3000000,1.59397 +15,gov31_fused,3000000,0.371678 +15,gov31,3000000,1.5907 
+16,gov40_fused,3000000,0.373749 +16,gov40,3000000,1.58823 +17,medicare1_fused,3000000,2.84118 +17,medicare1,3000000,1.71171 +18,medicare9_fused,3000000,2.65349 +18,medicare9,3000000,1.60143 +19,neon_air_pressure_fused,3000000,1.24682 +19,neon_air_pressure,3000000,1.58462 +20,neon_bio_temp_c_fused,3000000,2.64915 +20,neon_bio_temp_c,3000000,1.58949 +21,neon_dew_point_temp_fused,3000000,2.67631 +21,neon_dew_point_temp,3000000,1.60471 +22,neon_pm10_dust_fused,3000000,1.25322 +22,neon_pm10_dust,3000000,1.58591 +23,neon_wind_dir_fused,3000000,1.24637 +23,neon_wind_dir,3000000,1.58475 +24,nyc29_fused,3000000,2.83069 +24,nyc29,3000000,1.69559 +25,poi_lat_fused,3000000,3.02919 +25,poi_lat,3000000,1.92582 +26,poi_lon_fused,3000000,3.06686 +26,poi_lon,3000000,1.91633 +27,ssd_hdd_benchmarks_f_fused,3000000,2.70237 +27,ssd_hdd_benchmarks_f,3000000,1.61509 +28,stocks_de_fused,3000000,2.65821 +28,stocks_de,3000000,1.59535 +29,stocks_uk_fused,3000000,2.65434 +29,stocks_uk,3000000,1.59137 +30,stocks_usa_c_fused,3000000,2.63672 +30,stocks_usa_c,3000000,1.58866 diff --git a/publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.metadata b/publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.metadata new file mode 100644 index 0000000..46c7b2c --- /dev/null +++ b/publication/results/m6a_xlarge/fallback_scalar_nav_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-04-01T14:06:35+00:00 +Run on (4 X 2972.05 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x2) + L1 Instruction 32 KiB (x2) + L2 Unified 512 KiB (x2) + L3 Unified 8192 KiB (x1) +Load Average: 0.57, 0.56, 0.31 +cmake info: + source_dir: /home/ubuntu/bench_alp + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: m6a_xlarge +target info: + target_name: fallback_scalar_nav_1024_uf1_falp + target_compile_options: -O3 diff --git a/publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.csv b/publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..f444b54 --- /dev/null +++ b/publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,1.24232 +0,air_sensor_f,3000000,2.34048 +1,arade4_fused,3000000,1.0203 +1,arade4,3000000,2.16386 +2,basel_temp_f_fused,3000000,1.13238 +2,basel_temp_f,3000000,2.22609 +3,basel_wind_f_fused,3000000,1.14509 +3,basel_wind_f,3000000,2.22993 +4,bird_migration_f_fused,3000000,1.09402 +4,bird_migration_f,3000000,2.18599 +5,bitcoin_f_fused,3000000,1.13982 +5,bitcoin_f,3000000,2.21658 +6,bitcoin_transactions_f_fused,3000000,1.14926 +6,bitcoin_transactions_f,3000000,2.23225 +7,city_temperature_f_fused,3000000,1.05311 +7,city_temperature_f,3000000,2.16616 +8,cms1_fused,3000000,1.19185 +8,cms1,3000000,2.2777 +9,cms9_fused,3000000,1.04703 +9,cms9,3000000,2.17094 +10,cms25_fused,3000000,1.1983 +10,cms25,3000000,2.272 +11,food_prices_fused,3000000,1.13746 +11,food_prices,3000000,2.21348 +12,gov10_fused,3000000,1.16647 +12,gov10,3000000,2.24577 +13,gov26_fused,3000000,0.185645 +13,gov26,3000000,2.14029 +14,gov30_fused,3000000,0.188597 +14,gov30,3000000,2.14266 
+15,gov31_fused,3000000,0.186375 +15,gov31,3000000,2.14279 +16,gov40_fused,3000000,0.187113 +16,gov40,3000000,2.14446 +17,medicare1_fused,3000000,1.17946 +17,medicare1,3000000,2.26742 +18,medicare9_fused,3000000,1.04836 +18,medicare9,3000000,2.17203 +19,neon_air_pressure_fused,3000000,1.01395 +19,neon_air_pressure,3000000,2.16317 +20,neon_bio_temp_c_fused,3000000,1.04561 +20,neon_bio_temp_c,3000000,2.16989 +21,neon_dew_point_temp_fused,3000000,1.07472 +21,neon_dew_point_temp,3000000,2.17691 +22,neon_pm10_dust_fused,3000000,0.974362 +22,neon_pm10_dust,3000000,2.16267 +23,neon_wind_dir_fused,3000000,1.01307 +23,neon_wind_dir,3000000,2.15834 +24,nyc29_fused,3000000,1.20021 +24,nyc29,3000000,2.27827 +25,poi_lat_fused,3000000,1.34907 +25,poi_lat,3000000,2.44453 +26,poi_lon_fused,3000000,1.2771 +26,poi_lon,3000000,2.42822 +27,ssd_hdd_benchmarks_f_fused,3000000,1.09457 +27,ssd_hdd_benchmarks_f,3000000,2.18429 +28,stocks_de_fused,3000000,1.04637 +28,stocks_de,3000000,2.16459 +29,stocks_uk_fused,3000000,1.04453 +29,stocks_uk,3000000,2.16487 +30,stocks_usa_c_fused,3000000,1.05671 +30,stocks_usa_c,3000000,2.15959 diff --git a/publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.metadata b/publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..60ed8ca --- /dev/null +++ b/publication/results/m6a_xlarge/x86_64_avx2_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-04-01T13:59:40+00:00 +Run on (4 X 3309.51 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x2) + L1 Instruction 32 KiB (x2) + L2 Unified 512 KiB (x2) + L3 Unified 8192 KiB (x1) +Load Average: 0.00, 0.11, 0.08 +cmake info: + source_dir: /home/ubuntu/bench_alp + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG + cmake_build_type: Release + cmake_toolchain_file: m6a_xlarge +target info: + target_name: x86_64_avx2_intrinsic_1024_uf1_falp + target_compile_options: -mavx2 diff --git a/publication/results/president/fallback_scalar_aav_1024_uf1_falp.csv b/publication/results/president/fallback_scalar_aav_1024_uf1_falp.csv new file mode 100644 index 0000000..e371016 --- /dev/null +++ b/publication/results/president/fallback_scalar_aav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,300000,0.712891 +0,air_sensor_f,300000,1.04608 +1,arade4_fused,300000,0.50305 +1,arade4,300000,0.807323 +2,basel_temp_f_fused,300000,0.530896 +2,basel_temp_f,300000,0.925821 +3,basel_wind_f_fused,300000,0.478294 +3,basel_wind_f,300000,0.883148 +4,bird_migration_f_fused,300000,0.528204 +4,bird_migration_f,300000,0.865669 +5,bitcoin_f_fused,300000,0.689692 +5,bitcoin_f,300000,0.991247 +6,bitcoin_transactions_f_fused,300000,0.58093 +6,bitcoin_transactions_f,300000,0.992974 +7,city_temperature_f_fused,300000,0.46024 +7,city_temperature_f,300000,0.799294 +8,cms1_fused,300000,0.518135 +8,cms1,300000,0.948051 +9,cms9_fused,300000,0.460172 +9,cms9,300000,0.799183 +10,cms25_fused,300000,0.512336 +10,cms25,300000,0.974005 +11,food_prices_fused,300000,0.60797 +11,food_prices,300000,0.983582 +12,gov10_fused,300000,0.617295 +12,gov10,300000,0.980526 +13,gov26_fused,300000,0.0928684 +13,gov26,300000,0.658075 
+14,gov30_fused,300000,0.557516 +14,gov30,300000,0.881929 +15,gov31_fused,300000,0.0938745 +15,gov31,300000,0.655787 +16,gov40_fused,300000,0.504208 +16,gov40,300000,0.83337 +17,medicare1_fused,300000,0.752395 +17,medicare1,300000,1.09797 +18,medicare9_fused,300000,0.460114 +18,medicare9,300000,0.799611 +19,neon_air_pressure_fused,300000,0.633052 +19,neon_air_pressure,300000,0.892029 +20,neon_bio_temp_c_fused,300000,0.443757 +20,neon_bio_temp_c,300000,0.787927 +21,neon_dew_point_temp_fused,300000,0.470721 +21,neon_dew_point_temp,300000,0.811622 +22,neon_pm10_dust_fused,300000,0.425844 +22,neon_pm10_dust,300000,0.742215 +23,neon_wind_dir_fused,300000,0.45823 +23,neon_wind_dir,300000,0.715707 +24,nyc29_fused,300000,0.484059 +24,nyc29,300000,0.950653 +25,poi_lat_fused,300000,0.580862 +25,poi_lat,300000,1.02456 +26,poi_lon_fused,300000,0.526686 +26,poi_lon,300000,0.98664 +27,ssd_hdd_benchmarks_f_fused,300000,0.474549 +27,ssd_hdd_benchmarks_f,300000,0.823877 +28,stocks_de_fused,300000,0.449022 +28,stocks_de,300000,0.789811 +29,stocks_uk_fused,300000,0.460805 +29,stocks_uk,300000,0.798373 +30,stocks_usa_c_fused,300000,0.443708 +30,stocks_usa_c,300000,0.787847 diff --git a/publication/results/president/fallback_scalar_aav_1024_uf1_falp.metadata b/publication/results/president/fallback_scalar_aav_1024_uf1_falp.metadata new file mode 100644 index 0000000..077363a --- /dev/null +++ b/publication/results/president/fallback_scalar_aav_1024_uf1_falp.metadata @@ -0,0 +1,28 @@ +2023-02-26T17:03:36+01:00 +Run on (10 X 4500 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x10) + L1 Instruction 32 KiB (x10) + L2 Unified 1024 KiB (x10) + L3 Unified 14080 KiB (x1) +Load Average: 0.04, 0.29, 0.24 +***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. 
+cmake info: + source_dir: /export/scratch1/azim/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /ufs/azim/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-16.04/bin/clang + cmake_cxx_compiler: /ufs/azim/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-16.04/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 13.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: president +target info: + target_name: fallback_scalar_aav_1024_uf1_falp + target_compile_options: -mavx512dq diff --git a/publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv b/publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..b8187a5 --- /dev/null +++ b/publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,0.755931 +0,air_sensor_f,3000000,1.52339 +1,arade4_fused,3000000,0.581077 +1,arade4,3000000,1.13487 +2,basel_temp_f_fused,3000000,0.659132 +2,basel_temp_f,3000000,1.26027 +3,basel_wind_f_fused,3000000,0.652113 +3,basel_wind_f,3000000,1.23449 +4,bird_migration_f_fused,3000000,0.592453 +4,bird_migration_f,3000000,1.15142 +5,bitcoin_f_fused,3000000,0.620342 +5,bitcoin_f,3000000,1.22688 +6,bitcoin_transactions_f_fused,3000000,0.62753 +6,bitcoin_transactions_f,3000000,1.23362 +7,city_temperature_f_fused,3000000,0.567959 +7,city_temperature_f,3000000,1.11211 +8,cms1_fused,3000000,0.686547 +8,cms1,3000000,1.32294 +9,cms9_fused,3000000,0.564495 +9,cms9,3000000,1.10781 +10,cms25_fused,3000000,0.653321 +10,cms25,3000000,1.33992 +11,food_prices_fused,3000000,0.644977 +11,food_prices,3000000,1.25137 +12,gov10_fused,3000000,0.68484 +12,gov10,3000000,1.27757 +13,gov26_fused,3000000,0.126681 +13,gov26,3000000,0.855085 +14,gov30_fused,3000000,0.130608 +14,gov30,3000000,0.85754 +15,gov31_fused,3000000,0.127671 +15,gov31,3000000,0.861776 +16,gov40_fused,3000000,0.129627 +16,gov40,3000000,0.862454 +17,medicare1_fused,3000000,0.68763 +17,medicare1,3000000,1.34266 +18,medicare9_fused,3000000,0.569565 +18,medicare9,3000000,1.11179 +19,neon_air_pressure_fused,3000000,0.534937 +19,neon_air_pressure,3000000,1.01203 +20,neon_bio_temp_c_fused,3000000,0.559747 +20,neon_bio_temp_c,3000000,1.11398 +21,neon_dew_point_temp_fused,3000000,0.592601 +21,neon_dew_point_temp,3000000,1.12005 +22,neon_pm10_dust_fused,3000000,0.53699 +22,neon_pm10_dust,3000000,1.01887 +23,neon_wind_dir_fused,3000000,0.528438 +23,neon_wind_dir,3000000,1.02057 +24,nyc29_fused,3000000,0.652041 +24,nyc29,3000000,1.3388 +25,poi_lat_fused,3000000,0.936758 +25,poi_lat,3000000,1.68481 +26,poi_lon_fused,3000000,0.948398 +26,poi_lon,3000000,1.63669 +27,ssd_hdd_benchmarks_f_fused,3000000,0.588464 +27,ssd_hdd_benchmarks_f,3000000,1.14705 +28,stocks_de_fused,3000000,0.56686 +28,stocks_de,3000000,1.10916 +29,stocks_uk_fused,3000000,0.561063 +29,stocks_uk,3000000,1.08748 +30,stocks_usa_c_fused,3000000,0.550465 +30,stocks_usa_c,3000000,1.08826 diff --git a/publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata b/publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..d7e27c3 --- /dev/null +++ 
b/publication/results/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T17:02:15+02:00 +Run on (10 X 3299.67 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x10) + L1 Instruction 32 KiB (x10) + L2 Unified 1024 KiB (x10) + L3 Unified 14080 KiB (x1) +Load Average: 0.08, 0.06, 0.02 +cmake info: + source_dir: /export/scratch1/azim/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/lib64/ccache/clang + cmake_cxx_compiler: /usr/lib64/ccache/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.5 + cmake_crosscompiling: FALSE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: president +target info: + target_name: x86_64_avx512bw_intrinsic_1024_uf1_falp + target_compile_options: -mavx512dq diff --git a/publication/tmp/i4i_4xlarge/README.md b/publication/tmp/i4i_4xlarge/README.md new file mode 100644 index 0000000..6be420f --- /dev/null +++ b/publication/tmp/i4i_4xlarge/README.md @@ -0,0 +1,8 @@ +# Info +[i4i](https://aws.amazon.com/ec2/instance-types/i4i/): +- I4i instances are powered by the latest generation Intel Xeon Scalable (Ice Lake) Processors with an all-core turbo frequency of 3.5 GHz. + + +--- +## History +1. [link](https://www.cpubenchmark.net/cpu.php?cpu=Intel+Xeon+Platinum+8375C+%40+2.90GHz&id=4486) \ No newline at end of file diff --git a/publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv b/publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..b6499f8 --- /dev/null +++ b/publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,0.793257 +0,air_sensor_f,3000000,1.33114 +1,arade4_fused,3000000,0.569469 +1,arade4,3000000,0.978004 +2,basel_temp_f_fused,3000000,0.647115 +2,basel_temp_f,3000000,1.10038 +3,basel_wind_f_fused,3000000,0.635952 +3,basel_wind_f,3000000,1.09618 +4,bird_migration_f_fused,3000000,0.585521 +4,bird_migration_f,3000000,1.02182 +5,bitcoin_f_fused,3000000,0.612594 +5,bitcoin_f,3000000,1.07196 +6,bitcoin_transactions_f_fused,3000000,0.616801 +6,bitcoin_transactions_f,3000000,1.08706 +7,city_temperature_f_fused,3000000,0.565435 +7,city_temperature_f,3000000,0.989905 +8,cms1_fused,3000000,0.666972 +8,cms1,3000000,1.17086 +9,cms9_fused,3000000,0.563108 +9,cms9,3000000,0.97639 +10,cms25_fused,3000000,0.640577 +10,cms25,3000000,1.15707 +11,food_prices_fused,3000000,0.624783 +11,food_prices,3000000,1.08129 +12,gov10_fused,3000000,0.676186 +12,gov10,3000000,1.13762 +13,gov26_fused,3000000,0.108055 +13,gov26,3000000,0.833727 +14,gov30_fused,3000000,0.111435 +14,gov30,3000000,0.836312 +15,gov31_fused,3000000,0.108908 +15,gov31,3000000,0.836335 +16,gov40_fused,3000000,0.109792 +16,gov40,3000000,0.838302 +17,medicare1_fused,3000000,0.665111 +17,medicare1,3000000,1.17772 +18,medicare9_fused,3000000,0.567326 +18,medicare9,3000000,0.976106 +19,neon_air_pressure_fused,3000000,0.513865 +19,neon_air_pressure,3000000,0.867196 +20,neon_bio_temp_c_fused,3000000,0.558718 +20,neon_bio_temp_c,3000000,0.97441 +21,neon_dew_point_temp_fused,3000000,0.579757 +21,neon_dew_point_temp,3000000,0.999814 +22,neon_pm10_dust_fused,3000000,0.525327 +22,neon_pm10_dust,3000000,0.883437 +23,neon_wind_dir_fused,3000000,0.509356 
+23,neon_wind_dir,3000000,0.86718 +24,nyc29_fused,3000000,0.640688 +24,nyc29,3000000,1.15975 +25,poi_lat_fused,3000000,0.930615 +25,poi_lat,3000000,1.46123 +26,poi_lon_fused,3000000,0.882995 +26,poi_lon,3000000,1.40789 +27,ssd_hdd_benchmarks_f_fused,3000000,0.581742 +27,ssd_hdd_benchmarks_f,3000000,1.02073 +28,stocks_de_fused,3000000,0.563002 +28,stocks_de,3000000,0.976846 +29,stocks_uk_fused,3000000,0.559865 +29,stocks_uk,3000000,0.977135 +30,stocks_usa_c_fused,3000000,0.554831 +30,stocks_usa_c,3000000,0.970758 diff --git a/publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata b/publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..bd5eb76 --- /dev/null +++ b/publication/tmp/i4i_4xlarge/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T15:57:18+00:00 +Run on (16 X 2899.98 MHz CPU s) +CPU Caches: + L1 Data 48 KiB (x8) + L1 Instruction 32 KiB (x8) + L2 Unified 1280 KiB (x8) + L3 Unified 55296 KiB (x1) +Load Average: 0.31, 0.08, 0.02 +cmake info: + source_dir: /home/ubuntu/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/bin/clang + cmake_cxx_compiler: /usr/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: i4i_4xlarge +target info: + target_name: x86_64_avx512bw_intrinsic_1024_uf1_falp + target_compile_options: -mavx512dq diff --git a/publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.csv b/publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.csv new file mode 100644 index 0000000..e371016 --- /dev/null +++ b/publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,300000,0.712891 +0,air_sensor_f,300000,1.04608 +1,arade4_fused,300000,0.50305 +1,arade4,300000,0.807323 +2,basel_temp_f_fused,300000,0.530896 +2,basel_temp_f,300000,0.925821 +3,basel_wind_f_fused,300000,0.478294 +3,basel_wind_f,300000,0.883148 +4,bird_migration_f_fused,300000,0.528204 +4,bird_migration_f,300000,0.865669 +5,bitcoin_f_fused,300000,0.689692 +5,bitcoin_f,300000,0.991247 +6,bitcoin_transactions_f_fused,300000,0.58093 +6,bitcoin_transactions_f,300000,0.992974 +7,city_temperature_f_fused,300000,0.46024 +7,city_temperature_f,300000,0.799294 +8,cms1_fused,300000,0.518135 +8,cms1,300000,0.948051 +9,cms9_fused,300000,0.460172 +9,cms9,300000,0.799183 +10,cms25_fused,300000,0.512336 +10,cms25,300000,0.974005 +11,food_prices_fused,300000,0.60797 +11,food_prices,300000,0.983582 +12,gov10_fused,300000,0.617295 +12,gov10,300000,0.980526 +13,gov26_fused,300000,0.0928684 +13,gov26,300000,0.658075 +14,gov30_fused,300000,0.557516 +14,gov30,300000,0.881929 +15,gov31_fused,300000,0.0938745 +15,gov31,300000,0.655787 +16,gov40_fused,300000,0.504208 +16,gov40,300000,0.83337 +17,medicare1_fused,300000,0.752395 +17,medicare1,300000,1.09797 +18,medicare9_fused,300000,0.460114 +18,medicare9,300000,0.799611 +19,neon_air_pressure_fused,300000,0.633052 +19,neon_air_pressure,300000,0.892029 +20,neon_bio_temp_c_fused,300000,0.443757 +20,neon_bio_temp_c,300000,0.787927 +21,neon_dew_point_temp_fused,300000,0.470721 +21,neon_dew_point_temp,300000,0.811622 
+22,neon_pm10_dust_fused,300000,0.425844 +22,neon_pm10_dust,300000,0.742215 +23,neon_wind_dir_fused,300000,0.45823 +23,neon_wind_dir,300000,0.715707 +24,nyc29_fused,300000,0.484059 +24,nyc29,300000,0.950653 +25,poi_lat_fused,300000,0.580862 +25,poi_lat,300000,1.02456 +26,poi_lon_fused,300000,0.526686 +26,poi_lon,300000,0.98664 +27,ssd_hdd_benchmarks_f_fused,300000,0.474549 +27,ssd_hdd_benchmarks_f,300000,0.823877 +28,stocks_de_fused,300000,0.449022 +28,stocks_de,300000,0.789811 +29,stocks_uk_fused,300000,0.460805 +29,stocks_uk,300000,0.798373 +30,stocks_usa_c_fused,300000,0.443708 +30,stocks_usa_c,300000,0.787847 diff --git a/publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.metadata b/publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.metadata new file mode 100644 index 0000000..077363a --- /dev/null +++ b/publication/tmp/president/fallback_scalar_aav_1024_uf1_falp.metadata @@ -0,0 +1,28 @@ +2023-02-26T17:03:36+01:00 +Run on (10 X 4500 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x10) + L1 Instruction 32 KiB (x10) + L2 Unified 1024 KiB (x10) + L3 Unified 14080 KiB (x1) +Load Average: 0.04, 0.29, 0.24 +***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. +cmake info: + source_dir: /export/scratch1/azim/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /ufs/azim/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-16.04/bin/clang + cmake_cxx_compiler: /ufs/azim/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-16.04/bin/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 13.0.0 + cmake_crosscompiling: TRUE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: president +target info: + target_name: fallback_scalar_aav_1024_uf1_falp + target_compile_options: -mavx512dq diff --git a/publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv b/publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv new file mode 100644 index 0000000..b8187a5 --- /dev/null +++ b/publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.csv @@ -0,0 +1,63 @@ +benchmark_number,name,iterations,cycles_per_tuple +0,air_sensor_f_fused,3000000,0.755931 +0,air_sensor_f,3000000,1.52339 +1,arade4_fused,3000000,0.581077 +1,arade4,3000000,1.13487 +2,basel_temp_f_fused,3000000,0.659132 +2,basel_temp_f,3000000,1.26027 +3,basel_wind_f_fused,3000000,0.652113 +3,basel_wind_f,3000000,1.23449 +4,bird_migration_f_fused,3000000,0.592453 +4,bird_migration_f,3000000,1.15142 +5,bitcoin_f_fused,3000000,0.620342 +5,bitcoin_f,3000000,1.22688 +6,bitcoin_transactions_f_fused,3000000,0.62753 +6,bitcoin_transactions_f,3000000,1.23362 +7,city_temperature_f_fused,3000000,0.567959 +7,city_temperature_f,3000000,1.11211 +8,cms1_fused,3000000,0.686547 +8,cms1,3000000,1.32294 +9,cms9_fused,3000000,0.564495 +9,cms9,3000000,1.10781 +10,cms25_fused,3000000,0.653321 +10,cms25,3000000,1.33992 +11,food_prices_fused,3000000,0.644977 +11,food_prices,3000000,1.25137 +12,gov10_fused,3000000,0.68484 +12,gov10,3000000,1.27757 +13,gov26_fused,3000000,0.126681 +13,gov26,3000000,0.855085 +14,gov30_fused,3000000,0.130608 +14,gov30,3000000,0.85754 +15,gov31_fused,3000000,0.127671 +15,gov31,3000000,0.861776 +16,gov40_fused,3000000,0.129627 +16,gov40,3000000,0.862454 +17,medicare1_fused,3000000,0.68763 +17,medicare1,3000000,1.34266 
+18,medicare9_fused,3000000,0.569565 +18,medicare9,3000000,1.11179 +19,neon_air_pressure_fused,3000000,0.534937 +19,neon_air_pressure,3000000,1.01203 +20,neon_bio_temp_c_fused,3000000,0.559747 +20,neon_bio_temp_c,3000000,1.11398 +21,neon_dew_point_temp_fused,3000000,0.592601 +21,neon_dew_point_temp,3000000,1.12005 +22,neon_pm10_dust_fused,3000000,0.53699 +22,neon_pm10_dust,3000000,1.01887 +23,neon_wind_dir_fused,3000000,0.528438 +23,neon_wind_dir,3000000,1.02057 +24,nyc29_fused,3000000,0.652041 +24,nyc29,3000000,1.3388 +25,poi_lat_fused,3000000,0.936758 +25,poi_lat,3000000,1.68481 +26,poi_lon_fused,3000000,0.948398 +26,poi_lon,3000000,1.63669 +27,ssd_hdd_benchmarks_f_fused,3000000,0.588464 +27,ssd_hdd_benchmarks_f,3000000,1.14705 +28,stocks_de_fused,3000000,0.56686 +28,stocks_de,3000000,1.10916 +29,stocks_uk_fused,3000000,0.561063 +29,stocks_uk,3000000,1.08748 +30,stocks_usa_c_fused,3000000,0.550465 +30,stocks_usa_c,3000000,1.08826 diff --git a/publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata b/publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata new file mode 100644 index 0000000..d7e27c3 --- /dev/null +++ b/publication/tmp/president/x86_64_avx512bw_intrinsic_1024_uf1_falp.metadata @@ -0,0 +1,27 @@ +2023-03-30T17:02:15+02:00 +Run on (10 X 3299.67 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x10) + L1 Instruction 32 KiB (x10) + L2 Unified 1024 KiB (x10) + L3 Unified 14080 KiB (x1) +Load Average: 0.08, 0.06, 0.02 +cmake info: + source_dir: /export/scratch1/azim/bench_ALP + cmake_osx_architectures + cmake_host_system_processor: x86_64 + cmake_system_processor: x86_64 + cmake_host_system_name: Linux + cmake_system_name: Linux + cmake_c_compiler: /usr/lib64/ccache/clang + cmake_cxx_compiler: /usr/lib64/ccache/clang++ + cmake_cxx_compiler_id: Clang + cmake_cxx_compiler_version: 14.0.5 + cmake_crosscompiling: FALSE + cmake_cxx_flags_debug: -g + cmake_cxx_flags_release: -O3 -DNDEBUG -std=c++17 -stdlib=libstdc++ + cmake_build_type: Release + cmake_toolchain_file: president +target info: + target_name: x86_64_avx512bw_intrinsic_1024_uf1_falp + target_compile_options: -mavx512dq diff --git a/publication/zstd_compression_ratio.csv b/publication/zstd_compression_ratio.csv new file mode 100644 index 0000000..2eaa8e8 --- /dev/null +++ b/publication/zstd_compression_ratio.csv @@ -0,0 +1,31 @@ +dataset,size +Air-Pressure,9.39 +Arade/4,33.90 +Basel-Temp,18.44 +Basel-Wind,14.66 +Bird-Mig,21.02 +Btc-Price,42.08 +Blockchain,43.97 +City-Temp,16.77 +CMS/1,26.56 +CMS/9,14.73 +CMS/25,58.27 +Dew-Temp,25.07 +Bio-Temp,17.46 +Food-prices,18.32 +Gov/10,28.09 +Gov/26,0.23 +Gov/30,4.48 +Gov/31,1.63 +Gov/40,0.46 +Medicare/1,31.18 +Medicare/9,15.03 +PM10-dust,7.78 +NYC/29,27.50 +POI-lat,59.34 +POI-lon,60.98 +SD-bench,11.34 +Stocks-DE,10.54 +Stocks-UK,10.28 +Stocks-USA,8.56 +Wind-dir,25.53 diff --git a/scripts/run-clang-format.py b/scripts/run-clang-format.py new file mode 100644 index 0000000..217e16b --- /dev/null +++ b/scripts/run-clang-format.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python +"""A wrapper script around clang-format, suitable for linting multiple files +and to use for continuous integration. + +This is an alternative API for the clang-format command line. +It runs over multiple files and directories in parallel. +A diff output is produced and a sensible exit code is returned. 
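+
+Example (hypothetical invocation; the paths are placeholders, but the flags
+and exit codes are the ones this script itself defines in its argument
+parser and ExitStatus class below):
+
+    ./scripts/run-clang-format.py -r --extensions cpp,hpp benchmarks
+
+Exit code 0 means no reformatting was needed, 1 means a diff was printed,
+and 2 means the script failed to run clang-format.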
+ +""" + +# FROM https://github.com/Sarcasm/run-clang-format + +from __future__ import print_function, unicode_literals + +import argparse +import codecs +import difflib +import fnmatch +import io +import errno +import multiprocessing +import os +import signal +import subprocess +import sys +import traceback + +from functools import partial + +try: + from subprocess import DEVNULL # py3k +except ImportError: + DEVNULL = open(os.devnull, "wb") + +DEFAULT_EXTENSIONS = 'c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx' +DEFAULT_CLANG_FORMAT_IGNORE = '.clang-format-ignore' + + +class ExitStatus: + SUCCESS = 0 + DIFF = 1 + TROUBLE = 2 + + +def excludes_from_file(ignore_file): + excludes = [] + try: + with io.open(ignore_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('#'): + # ignore comments + continue + pattern = line.rstrip() + if not pattern: + # allow empty lines + continue + excludes.append(pattern) + except EnvironmentError as e: + if e.errno != errno.ENOENT: + raise + return excludes; + + +def list_files(files, recursive=False, extensions=None, exclude=None): + if extensions is None: + extensions = [] + if exclude is None: + exclude = [] + + out = [] + for file in files: + if recursive and os.path.isdir(file): + for dirpath, dnames, fnames in os.walk(file): + fpaths = [os.path.join(dirpath, fname) for fname in fnames] + for pattern in exclude: + # os.walk() supports trimming down the dnames list + # by modifying it in-place, + # to avoid unnecessary directory listings. + dnames[:] = [ + x for x in dnames + if + not fnmatch.fnmatch(os.path.join(dirpath, x), pattern) + ] + fpaths = [ + x for x in fpaths if not fnmatch.fnmatch(x, pattern) + ] + for f in fpaths: + ext = os.path.splitext(f)[1][1:] + if ext in extensions: + out.append(f) + else: + out.append(file) + return out + + +def make_diff(file, original, reformatted): + return list( + difflib.unified_diff( + original, + reformatted, + fromfile='{}\t(original)'.format(file), + tofile='{}\t(reformatted)'.format(file), + n=3)) + + +class DiffError(Exception): + def __init__(self, message, errs=None): + super(DiffError, self).__init__(message) + self.errs = errs or [] + + +class UnexpectedError(Exception): + def __init__(self, message, exc=None): + super(UnexpectedError, self).__init__(message) + self.formatted_traceback = traceback.format_exc() + self.exc = exc + + +def run_clang_format_diff_wrapper(args, file): + try: + ret = run_clang_format_diff(args, file) + return ret + except DiffError: + raise + except Exception as e: + raise UnexpectedError('{}: {}: {}'.format(file, e.__class__.__name__, + e), e) + + +def run_clang_format_diff(args, file): + try: + with io.open(file, 'r', encoding='utf-8') as f: + original = f.readlines() + except IOError as exc: + raise DiffError(str(exc)) + + if args.in_place: + invocation = [args.clang_format_executable, '-i', file] + else: + invocation = [args.clang_format_executable, file] + + if args.style: + invocation.extend(['--style', args.style]) + + if args.dry_run: + print(" ".join(invocation)) + return [], [] + + # Use of utf-8 to decode the process output. + # + # Hopefully, this is the correct thing to do. + # + # It's done due to the following assumptions (which may be incorrect): + # - clang-format will returns the bytes read from the files as-is, + # without conversion, and it is already assumed that the files use utf-8. + # - if the diagnostics were internationalized, they would use utf-8: + # > Adding Translations to Clang + # > + # > Not possible yet! 
+ # > Diagnostic strings should be written in UTF-8, + # > the client can translate to the relevant code page if needed. + # > Each translation completely replaces the format string + # > for the diagnostic. + # > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation + # + # It's not pretty, due to Python 2 & 3 compatibility. + encoding_py3 = {} + if sys.version_info[0] >= 3: + encoding_py3['encoding'] = 'utf-8' + + try: + proc = subprocess.Popen( + invocation, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + **encoding_py3) + except OSError as exc: + raise DiffError( + "Command '{}' failed to start: {}".format( + subprocess.list2cmdline(invocation), exc + ) + ) + proc_stdout = proc.stdout + proc_stderr = proc.stderr + if sys.version_info[0] < 3: + # make the pipes compatible with Python 3, + # reading lines should output unicode + encoding = 'utf-8' + proc_stdout = codecs.getreader(encoding)(proc_stdout) + proc_stderr = codecs.getreader(encoding)(proc_stderr) + # hopefully the stderr pipe won't get full and block the process + outs = list(proc_stdout.readlines()) + errs = list(proc_stderr.readlines()) + proc.wait() + if proc.returncode: + raise DiffError( + "Command '{}' returned non-zero exit status {}".format( + subprocess.list2cmdline(invocation), proc.returncode + ), + errs, + ) + if args.in_place: + return [], errs + return make_diff(file, original, outs), errs + + +def bold_red(s): + return '\x1b[1m\x1b[31m' + s + '\x1b[0m' + + +def colorize(diff_lines): + def bold(s): + return '\x1b[1m' + s + '\x1b[0m' + + def cyan(s): + return '\x1b[36m' + s + '\x1b[0m' + + def green(s): + return '\x1b[32m' + s + '\x1b[0m' + + def red(s): + return '\x1b[31m' + s + '\x1b[0m' + + for line in diff_lines: + if line[:4] in ['--- ', '+++ ']: + yield bold(line) + elif line.startswith('@@ '): + yield cyan(line) + elif line.startswith('+'): + yield green(line) + elif line.startswith('-'): + yield red(line) + else: + yield line + + +def print_diff(diff_lines, use_color): + if use_color: + diff_lines = colorize(diff_lines) + if sys.version_info[0] < 3: + sys.stdout.writelines((l.encode('utf-8') for l in diff_lines)) + else: + sys.stdout.writelines(diff_lines) + + +def print_trouble(prog, message, use_colors): + error_text = 'error:' + if use_colors: + error_text = bold_red(error_text) + print("{}: {} {}".format(prog, error_text, message), file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--clang-format-executable', + metavar='EXECUTABLE', + help='path to the clang-format executable', + default='clang-format') + parser.add_argument( + '--extensions', + help='comma separated list of file extensions (default: {})'.format( + DEFAULT_EXTENSIONS), + default=DEFAULT_EXTENSIONS) + parser.add_argument( + '-r', + '--recursive', + action='store_true', + help='run recursively over directories') + parser.add_argument( + '-d', + '--dry-run', + action='store_true', + help='just print the list of files') + parser.add_argument( + '-i', + '--in-place', + action='store_true', + help='format file instead of printing differences') + parser.add_argument('files', metavar='file', nargs='+') + parser.add_argument( + '-q', + '--quiet', + action='store_true', + help="disable output, useful for the exit code") + parser.add_argument( + '-j', + metavar='N', + type=int, + default=0, + help='run N clang-format jobs in parallel' + ' (default number of cpus + 1)') + parser.add_argument( + '--color', + default='auto', + 
choices=['auto', 'always', 'never'], + help='show colored diff (default: auto)') + parser.add_argument( + '-e', + '--exclude', + metavar='PATTERN', + action='append', + default=[], + help='exclude paths matching the given glob-like pattern(s)' + ' from recursive search') + parser.add_argument( + '--style', + help='formatting style to apply (LLVM, Google, Chromium, Mozilla, WebKit)') + + args = parser.parse_args() + + # use default signal handling, like diff return SIGINT value on ^C + # https://bugs.python.org/issue14229#msg156446 + signal.signal(signal.SIGINT, signal.SIG_DFL) + try: + signal.SIGPIPE + except AttributeError: + # compatibility, SIGPIPE does not exist on Windows + pass + else: + signal.signal(signal.SIGPIPE, signal.SIG_DFL) + + colored_stdout = False + colored_stderr = False + if args.color == 'always': + colored_stdout = True + colored_stderr = True + elif args.color == 'auto': + colored_stdout = sys.stdout.isatty() + colored_stderr = sys.stderr.isatty() + + version_invocation = [args.clang_format_executable, str("--version")] + try: + subprocess.check_call(version_invocation, stdout=DEVNULL) + except subprocess.CalledProcessError as e: + print_trouble(parser.prog, str(e), use_colors=colored_stderr) + return ExitStatus.TROUBLE + except OSError as e: + print_trouble( + parser.prog, + "Command '{}' failed to start: {}".format( + subprocess.list2cmdline(version_invocation), e + ), + use_colors=colored_stderr, + ) + return ExitStatus.TROUBLE + + retcode = ExitStatus.SUCCESS + + excludes = excludes_from_file(DEFAULT_CLANG_FORMAT_IGNORE) + excludes.extend(args.exclude) + + files = list_files( + args.files, + recursive=args.recursive, + exclude=excludes, + extensions=args.extensions.split(',')) + + if not files: + return + + njobs = args.j + if njobs == 0: + njobs = multiprocessing.cpu_count() + 1 + njobs = min(len(files), njobs) + + if njobs == 1: + # execute directly instead of in a pool, + # less overhead, simpler stacktraces + it = (run_clang_format_diff_wrapper(args, file) for file in files) + pool = None + else: + pool = multiprocessing.Pool(njobs) + it = pool.imap_unordered( + partial(run_clang_format_diff_wrapper, args), files) + pool.close() + while True: + try: + outs, errs = next(it) + except StopIteration: + break + except DiffError as e: + print_trouble(parser.prog, str(e), use_colors=colored_stderr) + retcode = ExitStatus.TROUBLE + sys.stderr.writelines(e.errs) + except UnexpectedError as e: + print_trouble(parser.prog, str(e), use_colors=colored_stderr) + sys.stderr.write(e.formatted_traceback) + retcode = ExitStatus.TROUBLE + # stop at the first unexpected error, + # something could be very wrong, + # don't process all files unnecessarily + if pool: + pool.terminate() + break + else: + sys.stderr.writelines(errs) + if outs == []: + continue + if not args.quiet: + print_diff(outs, use_color=colored_stdout) + if retcode == ExitStatus.SUCCESS: + retcode = ExitStatus.DIFF + if pool: + pool.join() + return retcode + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..82f94bb --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,6 @@ +add_library(ALP SHARED + falp.cpp + fastlanes_generated_unffor.cpp + fastlanes_generated_ffor.cpp + fastlanes_ffor.cpp + fastlanes_unffor.cpp) diff --git a/src/falp.cpp b/src/falp.cpp new file mode 100644 index 0000000..f1cfd88 --- /dev/null +++ b/src/falp.cpp @@ -0,0 +1,33955 @@ +#include "alp/falp.hpp" +#include "alp/constants.hpp" + +namespace generated { 
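+// Reader's sketch, not produced by the kernel generator: each falp_* function
+// below fuses bit-unpacking with the per-value ALP decode shown here, where
+// factor = alp::FACT_ARR[fac] and frac10 = alp::Constants::FRAC_ARR[exp].
+// The function name is illustrative only.
+[[maybe_unused]] static inline double falp_reference_decode_one(
+    uint64_t digits, uint64_t base, int64_t factor, double frac10) {
+	uint64_t tmp = digits + base;                   // undo frame-of-reference
+	tmp *= factor;                                  // rebuild the encoded integer
+	const auto tmp_int = static_cast<int64_t>(tmp); // reinterpret as signed
+	const auto tmp_dbl = static_cast<double>(tmp_int);
+	return tmp_dbl * frac10;                        // restore the decimal scale
+}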
namespace falp::fallback { namespace scalar { +static void falp_0bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + *(out + (i * 1) + (0 * 16) + (16 * 0)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = base_0; + *(out + (i * 1) + (0 * 16) + 
(16 * 54)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = base_0; + } +} +static void falp_1bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
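+		// The same shift/mask, add-base, multiply-by-factor, and frac10 scaling
+		// pattern repeats for each of the 64 values handled per lane.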
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 1) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_2bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + 
double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + 
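+		// All 32 two-bit values of this input word are unpacked at this point;
+		// the next word for lane i sits 16 words ahead because the packed
+		// layout interleaves the 16 lanes.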
register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_3bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] 
int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + 
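+		// Three-bit values do not divide the 64-bit word evenly, so a value can
+		// straddle two input words; the >> 63 / << 1 stitch a few values below
+		// reassembles it across a reload of register_0.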
tmp_0 = (register_0 >> 51) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 
38) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 3) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_4bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 
= (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; 
+ tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_5bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 5) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_6bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; 
+ tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = 
tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_7bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = 
tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 
7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 
1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_8bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = 
(register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 
8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_9bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 27) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 45) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
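+ // [editor's note - not part of the original patch] Each seven-statement group in these
+ // generated kernels fuses bit-unpacking with the ALP decode step. Assuming
+ // alp::FACT_ARR[f] == 10^f and alp::Constants::FRAC_ARR[e] == 10^-e, as defined
+ // elsewhere in this patch, every value is reconstructed as
+ //   decoded = double(int64_t((packed + base_0) * FACT_ARR[fac])) * FRAC_ARR[exp];
+ tmp_0 = (register_0 >> 54) & ((1ULL << 9) -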
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 9) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = 
tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 9) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 128);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 19) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_10bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
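+ // [editor's note - not part of the original patch] When a packed value straddles two
+ // 64-bit input words (as for value 56 of the 9-bit kernel above), its low bits come
+ // from the top of the current word, the next word is loaded into register_0, and the
+ // remaining high bits are OR-ed in with a left shift:
+ //   tmp_0 |= (register_0 & high_mask) << bits_already_taken;
+ // before the usual decode arithmetic runs on the stitched-together value.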
tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = 
(register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 144);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_11bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 11) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
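+ // [editor's note - not part of the original patch] On the naming: falp_{b}bw_64ow_64crw_1uf
+ // appears to encode the packed bit width b, 64-bit output words, a 64-bit cross-register
+ // word, and an unroll factor of 1. Each call decodes one 1024-value FastLanes vector:
+ // 16 lanes (the i loop) times 64 values per lane, with each lane reading b 64-bit words
+ // at offsets 0, 16, ..., 16 * (b - 1) from its starting position.
+ tmp_0 = (register_0 >> 33) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;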
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int 
= tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 11) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + 
tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 144);
+ tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 4;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 51) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 160);
+ tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 2;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 31) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 42) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_12bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 12) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
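+ // [editor's note - not part of the original patch] The stores are lane-interleaved:
+ // value k of lane i lands at out[i + 16 * k], mirroring the layout of the packed input,
+ // which is what lets clang vectorize the 16 independent lanes under the pragma above.
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) =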
tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = 
tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_13bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double
frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = 
tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + 
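+ // Because 13 does not divide 64, some codes straddle two input words: the
+ // straddling groups above first mask out the trailing bits of the old word,
+ // then load the next word and OR its leading bits into position before
+ // decoding as usual. The same stitching pattern recurs for every bit width
+ // that does not evenly divide 64.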
tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = 
(register_0 >> 35) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = 
(register_0 >> 51) & ((1ULL << 13) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_14bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 
= (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl 
= tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_15bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 8; + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) 
= tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + 
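+ // Note that the tmp_0 *= factor step may wrap around in uint64_t; the
+ // conversion to int64_t that follows recovers the intended signed product,
+ // assuming the usual two's-complement behavior of the targets this generated
+ // kernel is built for.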
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_16bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl 
*= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = 
(register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_17bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 17) - 1); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 
1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 17) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_18bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = 
alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_19bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 
>> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 
21) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_20bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) 
+ (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + 
tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_21bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + 
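+// NOTE (editorial comment; the expansions of these generated names are an
+// assumption, not stated in the source): the suffix presumably encodes the
+// packed bit width (21bw) and the unroll factor (1uf), with 64ow/64crw
+// referring to the 64-bit output and compression words. Each kernel fuses
+// bit-unpacking with the ALP decode step: unpack a bw-bit value, add the
+// frame-of-reference base (base_0), multiply by FACT_ARR[fac] (a power of
+// ten as an integer), convert the uint64 result to int64, widen to double,
+// and scale by frac10 = FRAC_ARR[exp] (i.e. 10^-exp). The output index
+// (i * 1) + (16 * k) interleaves 16 lanes, so one call decodes
+// 16 * 64 = 1024 doubles; the pragma asks clang to vectorize across lanes.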
[[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
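+// NOTE (editorial comment): the `tmp_int = tmp_0;` step below converts the
+// unsigned (packed + base) * factor product to int64_t. A negative encoded
+// value wraps modulo 2^64 during the uint64 arithmetic, and the conversion
+// appears to rely on two's-complement semantics to recover its sign before
+// the final multiplication by frac10 widens it back into a double.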
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_22bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 6)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = 
tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_23bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL 
<< 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 23) - 1); + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + 
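+// NOTE (editorial comment): the next value (index 30) straddles a word
+// boundary: its low 14 bits were just extracted from the previous 64-bit
+// word at shift 50, and its high 9 bits are merged in from the freshly
+// loaded word via `|= (... & ((1ULL << 9) - 1)) << 14`, reassembling the
+// full 14 + 9 = 23 bits.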
register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_24bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + 
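+// NOTE (editorial comment): at 24 bits, eight values fill exactly three
+// 64-bit words (8 * 24 = 192), so the unpacking pattern realigns every
+// eighth value; the word loaded above starts a fresh group, and value 8 is
+// extracted at shift 0 below with no carry bits from the previous word.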
tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_25bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + tmp_0 += base_0; 
+ tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in 
+ (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_26bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 
1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 
28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; 
+ tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_27bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 
21) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 
>> 36) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_28bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= 
((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 
16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_29bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + 
[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 14; + tmp_0 += base_0; + tmp_0 
*= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) 
+ (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_30bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1)
+ (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 
16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; 
+ tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_31bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; +
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; 
+ tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} 
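// [Editorial sketch, not part of the patch] Every falp_<bw>bw_64ow_64crw_1uf
// kernel above is a generated unrolling of one fused decode step: unpack a
// bw-bit value from the packed input words, add the frame-of-reference base,
// multiply by the factor (alp::FACT_ARR[fac], i.e. 10^fac), pass the result
// through int64_t, and scale by frac10 (alp::Constants::FRAC_ARR[exp], i.e.
// 10^-exp) to recover the original double. The generated code interleaves 16
// lanes of 64 values each (writes go to out[(i * 1) + (16 * k)]) so the clang
// loop vectorizer can process the lanes in parallel; the minimal sketch below
// uses a flat, sequential bit layout purely for readability, and the function
// name falp_decode_sketch is hypothetical.
#include <cstddef>
#include <cstdint>

static void falp_decode_sketch(const uint64_t* in, double* out, uint64_t base,
                               int64_t factor, double frac10, unsigned bw) {
	// 1024 values per block: 64 rows ("64ow") x 16 interleaved lanes in the
	// generated kernels; here decoded in plain sequential order.
	const uint64_t mask = (bw >= 64) ? ~0ULL : ((1ULL << bw) - 1);
	for (size_t idx = 0; idx < 1024; ++idx) {
		size_t   bit  = idx * bw;       // absolute bit position of this value
		size_t   word = bit >> 6;       // 64-bit input word holding its low bits
		unsigned off  = bit & 63;       // bit offset inside that word
		uint64_t v    = in[word] >> off;
		if (off + bw > 64) {            // value straddles two words: splice high bits
			v |= in[word + 1] << (64 - off);
		}
		v &= mask;                      // isolate the bw-bit packed value
		uint64_t t = v + base;          // undo frame-of-reference
		// Multiply by 10^fac with the same wrapping uint64 arithmetic the
		// kernels use (tmp_0 *= factor), then reinterpret as signed.
		int64_t s = static_cast<int64_t>(t * static_cast<uint64_t>(factor));
		out[idx] = static_cast<double>(s) * frac10; // scale by 10^-exp
	}
}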
+static void falp_32bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; +
*(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & 
((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_33bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 *
7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 21)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 
35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 49)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 
* 63)) = tmp_dbl; + } +} +static void falp_34bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 *
16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) 
+ (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & 
((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_35bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); +
tmp_0 = (register_0) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 
>> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = 
(register_0 >> 5) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; +
tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; +
tmp_0 = (register_0 >> 11) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; +
tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; +
tmp_0 = (register_0 >> 17) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; +
tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; +
tmp_0 = (register_0 >> 23) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; +
tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; +
tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; +
} +} +static void falp_36bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { +
[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; +
tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; +
tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor;
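+ // Decode recipe, repeated for every packed value in these generated kernels:
+ // extract the next bw-bit field, add base_0, multiply by the ALP factor,
+ // round-trip through int64_t, then scale by frac10 (presumably 10^-exp). +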
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + 
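// This 36-bit value straddles a word boundary: the low 8 bits were extracted
+ // above; the next load ORs in the remaining 28 bits at bit position 8. +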
register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 
= (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; +
tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; +
tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; +
tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; +
tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; +
tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; +
} +} +static void falp_37bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { +
[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; +
tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; +
tmp_0 = (register_0 >> 10) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; +
tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; +
tmp_0 = (register_0 >> 20) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; +
tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
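+ // Boundary-straddling case: only 7 bits of this 37-bit value remain in the
+ // current word; the following load supplies the other 30 bits. +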
register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + 
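// Another split value: the 32 bits above are the low half of a 37-bit field;
+ // the next word contributes the top 5 bits, OR-ed in at bit position 32. +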
register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
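// Output is lane-interleaved: value k of lane i is stored at out[i + 16 * k],
+ // so each call materializes 16 x 64 = 1024 decoded doubles. +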
*(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 37) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; +
tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; +
tmp_0 = (register_0 >> 17) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; +
tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; +
tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; +
} +} +static void falp_38bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { +
[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; +
tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; +
tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; +
tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; +
tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; +
tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; +
tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL <<
10) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL 
<< 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + 
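// The unsigned result is moved through int64_t before the double conversion so
+ // that negative encoded values are recovered with the correct sign. +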
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; +
tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; +
tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; +
tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; +
} +} +static void falp_39bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { +
[[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; +
tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; +
tmp_0 = (register_0 >> 14) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; +
tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; +
tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; +
tmp_0 = (register_0 >> 3) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; +
tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; +
tmp_0 = (register_0 >> 17) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) +
(0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + 
(0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 18; + tmp_0 += base_0; 
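
(Editorial sketch, not part of the patch.) Each generated falp_<BW>bw_64ow_64crw_1uf kernel in this file fuses FastLanes bit-unpacking at one fixed bit width BW with the ALP decode arithmetic that repeats throughout this hunk: un-FOR with base_0, multiply by the power-of-ten factor alp::FACT_ARR[fac], truncate to int64_t, then scale by alp::Constants::FRAC_ARR[exp]. A minimal scalar equivalent of one vector, assuming a flat (non-interleaved) layout and a hypothetical name falp_decode_scalar, could look like this:

    #include <cstdint>
    // Sketch only: the generated kernels use a 16-lane interleaved layout with
    // precomputed shifts/masks; this loop computes the same values bit-serially.
    static void falp_decode_scalar(const uint64_t* in, double* out, uint64_t base,
                                   uint8_t bw, uint8_t fac, uint8_t exp) {
        const uint64_t mask   = (bw == 64) ? ~0ULL : ((1ULL << bw) - 1);
        const int64_t  factor = alp::FACT_ARR[fac];            // power of ten
        const double   frac10 = alp::Constants::FRAC_ARR[exp]; // inverse power of ten
        uint64_t bit = 0;
        for (int k = 0; k < 1024; ++k) {      // one ALP vector (64 rows x 16 lanes)
            const uint64_t word = bit >> 6;   // which 64-bit word
            const uint64_t off  = bit & 63;   // bit offset inside it
            uint64_t v = in[word] >> off;
            if (off + bw > 64) { v |= in[word + 1] << (64 - off); } // value spans two words
            v = (v & mask) + base;            // undo frame-of-reference
            const int64_t tmp_int = static_cast<int64_t>(v * static_cast<uint64_t>(factor));
            out[k] = static_cast<double>(tmp_int) * frac10;
            bit += bw;
        }
    }
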
+ tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 39) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_40bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; +
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + 
(16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + 
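
(Editorial note.) The magic constants in these kernels are word offsets in the 16-lane interleaved FastLanes layout: word w of lane i sits at in + 16*w + i, which is why the loads step through 0, 16, 32, .... For BW = 40 (the kernel surrounding this note), the 64 values of a lane occupy 64 * 40 / 64 = 40 words, so the last load offset is 16 * 39 = 624; value j starts at bit (40 * j) mod 64 of word floor(40 * j / 64), and whenever that starting position plus 40 crosses a 64-bit boundary the next word is loaded and its low bits are OR-ed in, exactly as the shift/mask pairs above encode.
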
tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 
1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_41bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL <<
38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 24) 
- 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl 
*= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
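
(Editorial sketch, not part of the patch.) A caller selects one of these per-bit-width kernels at run time from the packed width of the vector; a hypothetical wrapper, named falp_decode here purely for illustration, would dispatch like this:

    // Sketch only: route to the generated kernel for the vector's bit width.
    inline void falp_decode(const uint64_t* in, double* out, const uint64_t* base,
                            uint8_t bw, uint8_t fac, uint8_t exp) {
        switch (bw) {
        case 39: falp_39bw_64ow_64crw_1uf(in, out, base, fac, exp); break;
        case 40: falp_40bw_64ow_64crw_1uf(in, out, base, fac, exp); break;
        case 41: falp_41bw_64ow_64crw_1uf(in, out, base, fac, exp); break;
        case 42: falp_42bw_64ow_64crw_1uf(in, out, base, fac, exp); break;
        // ... one case per generated width ...
        default: break;
        }
    }
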
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 
23) & ((1ULL << 41) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_42bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double *>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t *>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0
>> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = 
(register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_43bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 43) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 21;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 42;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 43) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 20;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 41;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 2) & ((1ULL << 43) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 19;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 40;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+ tmp_0 = (register_0 >> 3) & ((1ULL << 43) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *=
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl 
*= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 
59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 43) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 
>> 42) & ((1ULL << 22) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 672);
+ tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 22;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_44bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 44) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = 
(register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 
+= base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 688);
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_45bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 45) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 19;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 38;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 45) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 12;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 31;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 45) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 5;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
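+ // (Editorial annotation; the following comments are not emitted by the kernel
+ // generator.) Layout: each vector of 1024 values is packed as 16 interleaved
+ // lanes. For lane i, the w-th packed 64-bit word is read from
+ // in[(i * 1) + (w * 16)] and the v-th decoded double is written to
+ // out[(i * 1) + (16 * v)], which keeps lanes adjacent in memory so the i-loop
+ // vectorizes. Per value, the fused decode is: mask out the packed bits, add the
+ // frame-of-reference base_0, multiply by FACT_ARR[fac], convert to int64_t and
+ // then to double, and scale by FRAC_ARR[exp] (in ALP these tables appear to
+ // hold the positive and negative powers of ten, respectively).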
+ register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); 
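+ // (Editorial annotation, continued.) When a 45-bit value straddles two 64-bit
+ // words, the generator splits the decode across the word boundary: the low
+ // bits come from the current word (register_0 >> shift), the next word is then
+ // loaded, and its low bits are masked and OR-ed in at the correct position
+ // (tmp_0 |= (register_0 & mask) << n), as in the statements around this note.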
+ register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 
1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_46bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |=
((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= 
((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + 
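/* Invariant of this generated 46-bit kernel: every shift/mask pair contributes bits summing to 46 per output value, and each of the 16 lanes walks 46 input words (offsets 0..720 in steps of 16) to emit its 64 doubles. */ +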
register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 
1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_47bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; +
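/* Same fused FALP step at 47 bits: unpack the packed digits, add base_0 (frame of reference), multiply by FACT_ARR[fac], reinterpret through int64_t, then scale by frac10. */ +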
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 
14) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & 
((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 
37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_48bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); +
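/* 48 divides 64 * 3, so bit offsets cycle 0/48/32/16: every three input words hold exactly four values, and after value 3 the pattern below restarts on a fresh word with no cross-word remainder. */ +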
tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) 
+ (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + 
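/* Worked example (assuming FACT_ARR[i] = 10^i and FRAC_ARR[i] = 10^-i as in ALP's constant tables): with fac = 2, exp = 3, a packed value v decodes as double(int64_t((v + base_0) * 100)) * 0.001. */ +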
tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 
16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_49bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl =
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & 
((1ULL << 16) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = 
(register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 49) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
656);
+        tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 27;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl;
+        tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 672);
+        tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 42;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+        tmp_0 = (register_0 >> 7) & ((1ULL << 49) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+        tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 688);
+        tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 8;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+        tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 704);
+        tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 23;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+        tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 720);
+        tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 38;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+        tmp_0 = (register_0 >> 11) & ((1ULL << 49) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+        tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 736);
+        tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 4;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+        tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 752);
+        tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 19;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+        tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 768);
+        tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 34;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+        tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+    }
+}
+static void falp_50bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+    [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+    [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+    [[maybe_unused]] uint64_t register_0;
+    [[maybe_unused]] uint64_t tmp_0;
+    [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+    [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+    [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+    [[maybe_unused]] double tmp_dbl;
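+    // Editorial annotation (not upstream-generated code): FACT_ARR[fac] and
+    // Constants::FRAC_ARR[exp] presumably hold the positive powers of ten and
+    // the matching 10^-exp fractions with which FALP undoes ALP's decimal
+    // scaling while it unpacks; fac and exp are per-vector metadata.
+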
[[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + 
(i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; 
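+        // Editorial sketch (annotation, not upstream-generated code): each
+        // 50-bit value above is stitched from at most two adjacent packed
+        // words: low bits by shift-and-mask from the current word, spilled
+        // high bits OR-ed in from the next one (value 49: 46 low bits plus
+        // 4 carried bits shifted left by 46). The fused decode per value is,
+        // in scalar form,
+        //   uint64_t enc    = packed + base_0;           // wraps mod 2^64
+        //   int64_t  digits = (int64_t)(enc * factor);
+        //   double   value  = (double)digits * frac10;
+        // matching the signed ALP encoding via two's-complement wrap-around.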
+ tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + 
tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+        tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+    }
+}
+static void falp_51bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+    [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+    [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+    [[maybe_unused]] uint64_t register_0;
+    [[maybe_unused]] uint64_t tmp_0;
+    [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+    [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+    [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+    [[maybe_unused]] double tmp_dbl;
+    [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+    for (int i = 0; i < 16; ++i) {
+        register_0 = *(in + (0 * 16) + (i * 1) + 0);
+        tmp_0 = (register_0) & ((1ULL << 51) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+        tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 16);
+        tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 13;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+        tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 32);
+        tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 26;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+        tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 48);
+        tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 39;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+        tmp_0 = (register_0 >> 12) & ((1ULL << 51) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+        tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 64);
+        tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 1;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+        tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 80);
+        tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 14;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+        tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 96);
+        tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 27;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+        tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 112);
+        tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 40;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) +
(16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 17; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + 
(0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 22; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 51) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+        tmp_0 = (register_0 >> 1) & ((1ULL << 51) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+        tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 768);
+        tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 12;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+        tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 784);
+        tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 25;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+        tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 800);
+        tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 38;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+        tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+    }
+}
+static void falp_52bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+    [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+    [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+    [[maybe_unused]] uint64_t register_0;
+    [[maybe_unused]] uint64_t tmp_0;
+    [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+    [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+    [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+    [[maybe_unused]] double tmp_dbl;
+    [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+    for (int i = 0; i < 16; ++i) {
+        register_0 = *(in + (0 * 16) + (i * 1) + 0);
+        tmp_0 = (register_0) & ((1ULL << 52) - 1);
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+        tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 16);
+        tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+        tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 32);
+        tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+        tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 48);
+        tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+        tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1);
+        register_0 = *(in + (0 * 16) + (i * 1) + 64);
+        tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48;
+        tmp_0 += base_0;
+        tmp_0 *= factor;
+        tmp_int = tmp_0;
+        tmp_dbl = tmp_int;
+        tmp_dbl *= frac10;
+        *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+        tmp_0 = (register_0 >> 4) &
((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) 
= tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + 
(i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & 
((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= 
factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 736);
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 752);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 768);
+ tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 784);
+ tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 800);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 816);
+ tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_53bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 53) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 11;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out +
(i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 30; + tmp_0 
+= base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + 
*(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) 
- 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_54bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 54) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 40;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 112);
+ tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl;
+ tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 128);
+ tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 
36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) 
+ (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 800);
+ tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 816);
+ tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 832);
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 848);
+ tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_55bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ double* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p,
+ uint8_t fac,
+ uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 55) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 9;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 18;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 27;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 36;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 19) &
((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL 
<< 30) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 55) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + 
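+ // The same fused decode repeats for every packed value: shift/mask the
+ // bit-packed lanes out of the 64-bit input words, add the frame-of-reference
+ // base (base_0), multiply by the factor table entry (alp::FACT_ARR[fac]),
+ // round-trip through int64_t and double, then scale by frac10
+ // (alp::Constants::FRAC_ARR[exp]) to reconstruct the decoded double.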
tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 752);
+ tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 38;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 768);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 47;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 55) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 784);
+ tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 1;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 800);
+ tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 10;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 816);
+ tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 19;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 832);
+ tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 28;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 848);
+ tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 37;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 864);
+ tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 46;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_56bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in
+ (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 
32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + 
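+ // This value straddles a word boundary in the 56-bit kernel: its low 8 bits
+ // were extracted from the previous input word above, and the remaining
+ // 48 bits are OR-ed in from the freshly loaded word below.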
register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + 
tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = 
(register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 880);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_57bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 57) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 7;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 14;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl;
+ tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 21;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 28;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 35;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 42;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl;
+ tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 112);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 49;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 56)
- 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 
1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & 
((1ULL << 25) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= 
((register_0) & ((1ULL << 5) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 57) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; 
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl;
+ tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 816);
+ tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 15;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl;
+ tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 832);
+ tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 22;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl;
+ tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 848);
+ tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 29;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 864);
+ tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 36;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl;
+ tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 880);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 43;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 896);
+ tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 50;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl;
+ }
+}
+static void falp_58bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+                                     double* __restrict a_out_p,
+                                     const uint64_t* __restrict a_base_p,
+                                     uint8_t fac,
+                                     uint8_t exp) {
+ [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac];
+ [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp];
+ [[maybe_unused]] double tmp_dbl;
+ [[maybe_unused]] int64_t tmp_int;
+#pragma clang loop vectorize(enable)
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 58) - 1);
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6;
+ tmp_0 += base_0;
+ tmp_0 *= factor;
+ tmp_int = tmp_0;
+ tmp_dbl = tmp_int;
+ tmp_dbl *= frac10;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12;
+ tmp_0 += base_0;
+ tmp_0 *=
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 = (register_0) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + 
tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = 
tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_59bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor;
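+ // NOTE: each of the 64 unrolled steps in these generated kernels repeats the same
+ // fused FALP decode on the freshly unpacked bits: un-FOR by adding base_0, multiply
+ // by factor = alp::FACT_ARR[fac] (presumably 10^fac), convert through int64_t to
+ // double, then scale by frac10 = alp::Constants::FRAC_ARR[exp] (presumably 10^-exp)
+ // to reconstruct the original double.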
+ tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 32; + tmp_0 += base_0; + 
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 28; + tmp_0 += 
base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 24; + 
tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_60bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 
52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & 
((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) 
+ (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL 
<< 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= 
factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_61bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= 
frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 59; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = 
tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 
|= ((register_0) & ((1ULL << 24) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_62bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i
* 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = 
tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + 
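// Per-value decode pattern, repeated for all 64 values of this unrolled loop: at this
+ // point tmp_0 holds one 62-bit packed value, stitched together from the high-order
+ // bits of the previous input word (the value's low bits) and the low-order bits of the
+ // word just loaded (its high bits), with the frame-of-reference base (base_0) added
+ // back. The statements that follow multiply by factor (FACT_ARR[fac], an integer
+ // power of ten in the ALP scheme), reinterpret the result as a signed int64, widen it
+ // to double, and scale by frac10 (FRAC_ARR[exp], the matching fractional power of
+ // ten) to reconstruct the decoded double. +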
tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + 
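// tmp_int reinterprets the unsigned sum-and-product as a signed two's-complement
+ // int64, and tmp_dbl widens it to double; the multiply by frac10 on the next line is
+ // the final rescaling step that yields the decoded value. +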
tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= 
((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = 
tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_63bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 63) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_dbl; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 62) - 1)) << 1; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_dbl; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 61) - 1)) << 2; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_dbl; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 3; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_dbl; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 4; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_dbl; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 5; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) =
tmp_dbl; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 6; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_dbl; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 7; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_dbl; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 8; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_dbl; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 9; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_dbl; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 10; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_dbl; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 11; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_dbl; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 12; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_dbl; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 13; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_dbl; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 14; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_dbl; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 15; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_dbl; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 16; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_dbl; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 17; + tmp_0 += base_0; + tmp_0 *= factor; 
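+ // With bw = 63, each value's starting bit offset advances by 63, so the carry shift
+ // seen above grows by one per value and exactly one fresh 64-bit word is loaded per
+ // value: over the 64 values of a lane, 63 input words are consumed (64 values x 63
+ // bits = 63 words x 64 bits).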
+ tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_dbl; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 18; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_dbl; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 19; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_dbl; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 20; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_dbl; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 21; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_dbl; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 22; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_dbl; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 23; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_dbl; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 24; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_dbl; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 25; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_dbl; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 26; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_dbl; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 27; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_dbl; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 28; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_dbl; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + 
(0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 29; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_dbl; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 30; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_dbl; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 31; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_dbl; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 32; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_dbl; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 33; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_dbl; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 34; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_dbl; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 35; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_dbl; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 36; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_dbl; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 37; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_dbl; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 38; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_dbl; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 39; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_dbl; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 40; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out 
+ (i * 1) + (0 * 16) + (16 * 40)) = tmp_dbl; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 41; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_dbl; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 42; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_dbl; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 43; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_dbl; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 44; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_dbl; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 45; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_dbl; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 46; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_dbl; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 47; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_dbl; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 48; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_dbl; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 49; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_dbl; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 50; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_dbl; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 51; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_dbl; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 
1)) << 52; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_dbl; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 53; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_dbl; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 54; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_dbl; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 55; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_dbl; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 56; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_dbl; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 57; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_dbl; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 58; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_dbl; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 59; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_dbl; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 60; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_dbl; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 61; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_dbl; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 992); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 62; + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_dbl; + tmp_0 = (register_0 >> 1) & ((1ULL << 63) - 1); + tmp_0 += base_0; + tmp_0 *= factor; + tmp_int = tmp_0; + tmp_dbl = tmp_int; + tmp_dbl *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_dbl; + } +} +static void falp_64bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + const uint64_t* __restrict 
a_base_p, + uint8_t fac, + uint8_t exp) { + [[maybe_unused]] auto out = reinterpret_cast<double*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + [[maybe_unused]] int64_t factor = alp::FACT_ARR[fac]; + [[maybe_unused]] double frac10 = alp::Constants::FRAC_ARR[exp]; + [[maybe_unused]] double tmp_dbl; + [[maybe_unused]] int64_t tmp_int; +#pragma clang loop vectorize(enable) + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl =
tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = 
tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 512); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 544); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 592); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 608); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 640); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 656); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 688); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 704); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = 
tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 736); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 752); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 768); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 800); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 816); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 832); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 848); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 864); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 880); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 896); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 912); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 928); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 944); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 960); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 976); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = 
tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 992); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 1008); + register_0 += base_0; + register_0 *= factor; + tmp_int = register_0; + tmp_dbl = tmp_int; + register_0 *= frac10; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = register_0; + } +} +void falp(const uint64_t* __restrict a_in_p, + double* __restrict a_out_p, + uint8_t bw, + const uint64_t* __restrict a_base_p, + uint8_t fac, + uint8_t exp) { + switch (bw) { + case 0: + falp_0bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 1: + falp_1bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 2: + falp_2bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 3: + falp_3bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 4: + falp_4bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 5: + falp_5bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 6: + falp_6bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 7: + falp_7bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 8: + falp_8bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 9: + falp_9bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 10: + falp_10bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 11: + falp_11bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 12: + falp_12bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 13: + falp_13bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 14: + falp_14bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 15: + falp_15bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 16: + falp_16bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 17: + falp_17bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 18: + falp_18bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 19: + falp_19bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 20: + falp_20bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 21: + falp_21bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 22: + falp_22bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 23: + falp_23bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 24: + falp_24bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 25: + falp_25bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 26: + falp_26bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 27: + falp_27bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 28: + falp_28bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 29: + falp_29bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 30: + falp_30bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 31: + falp_31bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 32: + falp_32bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp); + break; + case 33: + falp_33bw_64ow_64crw_1uf(a_in_p, 
a_out_p, a_base_p, fac, exp);
+		break;
+	case 34:
+		falp_34bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 35:
+		falp_35bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 36:
+		falp_36bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 37:
+		falp_37bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 38:
+		falp_38bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 39:
+		falp_39bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 40:
+		falp_40bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 41:
+		falp_41bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 42:
+		falp_42bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 43:
+		falp_43bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 44:
+		falp_44bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 45:
+		falp_45bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 46:
+		falp_46bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 47:
+		falp_47bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 48:
+		falp_48bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 49:
+		falp_49bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 50:
+		falp_50bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 51:
+		falp_51bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 52:
+		falp_52bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 53:
+		falp_53bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 54:
+		falp_54bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 55:
+		falp_55bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 56:
+		falp_56bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 57:
+		falp_57bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 58:
+		falp_58bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 59:
+		falp_59bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 60:
+		falp_60bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 61:
+		falp_61bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 62:
+		falp_62bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 63:
+		falp_63bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	case 64:
+		falp_64bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p, fac, exp);
+		break;
+	}
+}
+}}}; // namespace generated::falp::fallback::scalar
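The falp() entry point above is a plain bit-width dispatch: each generated kernel falp_Nbw_64ow_64crw_1uf decodes what appears to be one 1024-value vector (the output offsets run up to (16 * 63) + i) at a fixed bit width N, so the bw branch is taken once per vector and the unrolled loop bodies stay branch-free. A minimal caller sketch follows; the driver function and buffer names are illustrative, not part of the patch, and only the falp signature shown above is assumed:

// Hypothetical usage sketch: decode a single 1024-value FALP vector.
#include <cstdint>

namespace generated::falp::fallback::scalar {
// Declaration matching the definition above.
void falp(const uint64_t* __restrict a_in_p, double* __restrict a_out_p, uint8_t bw,
          const uint64_t* __restrict a_base_p, uint8_t fac, uint8_t exp);
} // namespace generated::falp::fallback::scalar

void decode_one_vector(const uint64_t* packed, const uint64_t* base,
                       uint8_t bit_width, uint8_t fac, uint8_t exp,
                       double* out /* must have room for 1024 doubles */) {
	// One dispatch per vector; the selected kernel is fully unrolled inside.
	generated::falp::fallback::scalar::falp(packed, out, bit_width, base, fac, exp);
}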
diff --git a/src/fastlanes_ffor.cpp b/src/fastlanes_ffor.cpp
new file mode 100644
index 0000000..d54f368
--- /dev/null
+++ b/src/fastlanes_ffor.cpp
@@ -0,0 +1,36 @@
+#include "fastlanes/ffor.hpp"
+
+namespace fastlanes::generated::ffor::fallback::scalar {
+
+void ffor(const int64_t* __restrict in, int64_t* __restrict out, uint8_t bw, const int64_t* __restrict a_base_p) {
+	auto const* in_u = reinterpret_cast<const uint64_t*>(in);
+	auto* out_u = reinterpret_cast<uint64_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint64_t*>(a_base_p);
+
+	ffor(in_u, out_u, bw, base_u);
+}
+
+void ffor(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw, const int32_t* __restrict a_base_p) {
+	auto const* in_u = reinterpret_cast<const uint32_t*>(in);
+	auto* out_u = reinterpret_cast<uint32_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint32_t*>(a_base_p);
+
+	ffor(in_u, out_u, bw, base_u);
+}
+
+void ffor(const int16_t* __restrict in, int16_t* __restrict out, uint8_t bw, const int16_t* __restrict a_base_p) {
+	auto const* in_u = reinterpret_cast<const uint16_t*>(in);
+	auto* out_u = reinterpret_cast<uint16_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint16_t*>(a_base_p);
+
+	ffor(in_u, out_u, bw, base_u);
+}
+
+void ffor(const int8_t* __restrict in, int8_t* __restrict out, uint8_t bw, const int8_t* __restrict a_base_p) {
+	auto const* in_u = reinterpret_cast<const uint8_t*>(in);
+	auto* out_u = reinterpret_cast<uint8_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint8_t*>(a_base_p);
+
+	ffor(in_u, out_u, bw, base_u);
+}
+} // namespace fastlanes::generated::ffor::fallback::scalar
\ No newline at end of file
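fastlanes_ffor.cpp exists only to route the signed overloads onto the unsigned kernels: on two's-complement targets (mandated since C++20), subtracting the base on the reinterpreted unsigned bit pattern yields the same low-order bw bits as signed subtraction, so one unsigned implementation serves both. Below is a small self-contained check of that equivalence; the 3-bit width and the concrete values are illustrative, and the restore step is written inline rather than calling the real unpacking routine:

// Sketch: why packing signed deltas through the unsigned kernels round-trips,
// assuming the delta fits in the chosen bit width.
#include <cassert>
#include <cstdint>

int main() {
	int64_t value = -5, base = -9; // delta = 4, fits in 3 bits
	auto u_val  = static_cast<uint64_t>(value);
	auto u_base = static_cast<uint64_t>(base);
	// What an ffor_3bit kernel stores: base-relative delta, masked to 3 bits.
	uint64_t packed = (u_val - u_base) & ((1ULL << 3) - 1);
	// Restore: add the base back on the unsigned side, reinterpret as signed.
	int64_t restored = static_cast<int64_t>(packed + u_base);
	assert(restored == value);
	return 0;
}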
diff --git a/src/fastlanes_generated_ffor.cpp b/src/fastlanes_generated_ffor.cpp
new file mode 100644
index 0000000..3f4be4d
--- /dev/null
+++ b/src/fastlanes_generated_ffor.cpp
@@ -0,0 +1,30138 @@
+#include "fastlanes/ffor.hpp"
+#include "fastlanes/macros.hpp"
+namespace fastlanes { namespace generated { namespace ffor::fallback { namespace scalar {
+void static ffor_0bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) {}
+void static ffor_1bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) {
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp = src;
+		src = *(in + 128 * 1 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 1U;
+		src = *(in + 128 * 2 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 3 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 3U;
+		src = *(in + 128 * 4 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 5 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 5U;
+		src = *(in + 128 * 6 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 6U;
+		src = *(in + 128 * 7 + i) - *(a_base_p);
+		src = src & ((1ULL << 1) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out -= 0;
+	}
+}
+void static ffor_2bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) {
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src;
+		src = *(in + 128 * 1 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 2 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 3 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 4 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp = src;
+		src = *(in + 128 * 5 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 6 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 7 + i) - *(a_base_p);
+		src = src & ((1ULL << 2) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out -= 128;
+	}
+}
+void static ffor_3bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) {
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src;
+		src = *(in + 128 * 1 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 3U;
+		src = *(in + 128 * 2 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 2 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 2U;
+		src = *(in + 128 * 3 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 1U;
+		src = *(in + 128 * 4 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 4U;
+		src = *(in + 128 * 5 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 5 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp = src >> 1U;
+		src = *(in + 128 * 6 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 7 + i) - *(a_base_p);
+		src = src & ((1ULL << 3) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out -= 256;
+	}
+}
+void static ffor_4bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) {
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 128 * 1 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 2 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 128 * 3 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 4 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 128 * 5 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 6 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp = src;
+		src = *(in + 128 * 7 + i) - *(a_base_p);
+		src = src & ((1ULL << 4) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out -= 384;
+	}
+}
+void static ffor_5bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) {
+	uint8_t tmp = 0U;
+	uint8_t src;
+	for (int i = 0; i < 128; i++) {
+		src = *(in + 128 * 0 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src;
+		src = *(in + 128 * 1 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 5U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 1 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 3U;
+		src = *(in + 128 * 2 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 2U;
+		src = *(in + 128 * 3 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 7U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 3 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 1U;
+		src = *(in + 128 * 4 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 4U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 4 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 4U;
+		src = *(in + 128 * 5 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 1U;
+		src = *(in + 128 * 6 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 6U;
+		*(out + i) = tmp;
+		out += 128;
+		src = *(in + 128 * 6 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp = src >> 2U;
+		src = *(in + 128 * 7 + i) - *(a_base_p);
+		src = src & ((1ULL << 5) - 1);
+		tmp |= src << 3U;
+		*(out + i) = tmp;
+		out -= 512;
+	}
+}
+void static ffor_6bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) {
+ uint8_t tmp = 0U; + uint8_t src; + for (int i = 0; i < 128; i++) { + src = *(in + 128 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src; + src = *(in + 128 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 2U; + src = *(in + 128 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 4U; + src = *(in + 128 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src; + src = *(in + 128 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 2U; + src = *(in + 128 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 4U; + src = *(in + 128 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out -= 640; + } +} +void static ffor_7bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) { + uint8_t tmp = 0U; + uint8_t src; + for (int i = 0; i < 128; i++) { + src = *(in + 128 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src; + src = *(in + 128 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 1U; + src = *(in + 128 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 2U; + src = *(in + 128 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 3U; + src = *(in + 128 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 4U; + src = *(in + 128 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 3U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 5U; + src = *(in + 128 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 6U; + src = *(in + 128 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 1U; + *(out + i) = tmp; + out -= 768; + } +} +void static ffor_8bit_8ow(const uint8_t* __restrict in, uint8_t* __restrict out, const uint8_t* __restrict a_base_p) { + uint8_t tmp = 0U; + uint8_t src; + for (int i = 0; i < 128; i++) { + src = *(in + 128 * 0 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 1 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 128; + src 
= *(in + 128 * 2 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 3 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 4 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 5 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 6 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 128; + src = *(in + 128 * 7 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out -= 896; + } +} +void static ffor_0bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) {} +void static ffor_1bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 1U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 2U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 3U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 4U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 5U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 6U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 7U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 9U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 10U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 11U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 12U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 13U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 14U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out -= 0; + } +} +void static ffor_2bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 2U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 4U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 6U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 8U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 10U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 12U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp = src; + src = *(in + 64 * 9 + i) - *(a_base_p); + src 
= src & ((1ULL << 2) - 1); + tmp |= src << 2U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 4U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 6U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 8U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 10U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 12U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out -= 64; + } +} +void static ffor_3bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 3U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 6U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 9U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 12U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src >> 1U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 2U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 5U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 11U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src >> 2U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 1U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 4U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 7U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 10U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out -= 128; + } +} +void static ffor_4bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp 
|= src << 8U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out -= 192; + } +} +void static ffor_5bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 5U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 10U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 1U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 4U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 9U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 2U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 3U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 3U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 2U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 7U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 4U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 1U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 6U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out -= 256; + } +} +void static ffor_6bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src; + src = *(in + 64 * 1 + 
i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 6U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 4U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 2U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 8U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 2U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 4U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 6U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 4U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 2U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 8U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 2U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 4U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out -= 320; + } +} +void static ffor_7bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 7U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 2U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 5U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 4U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 3U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 6U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 1U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 64; + src 
= *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 1U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 6U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 3U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 4U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 5U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 2U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out -= 384; + } +} +void static ffor_8bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out -= 448; + } +} +void static ffor_9bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 7U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 2U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src 
= src & ((1ULL << 9) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 5U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 4U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 3U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 6U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 1U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 1U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 6U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 3U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 4U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 5U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 2U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out -= 512; + } +} +void static ffor_10bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 6U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 4U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 2U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 8U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 2U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 4U; + src = *(in + 64 * 7 + i) - *(a_base_p); + 
src = src & ((1ULL << 10) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 6U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 4U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 2U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 8U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 2U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 4U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out -= 576; + } +} +void static ffor_11bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 5U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 10U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 1U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 4U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 9U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 2U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 3U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 3U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src 
>> 2U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 7U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 4U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 1U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 6U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out -= 640; + } +} +void static ffor_12bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + *(out + i) = 
tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out -= 704; + } +} +void static ffor_13bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 3U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 6U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 9U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 12U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 1U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 2U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 5U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 11U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 2U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 1U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 4U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 7U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 10U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 3U; + *(out + i) = tmp; 
+ out -= 768; + } +} +void static ffor_14bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 2U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 4U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 6U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 8U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 10U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 12U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 2U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 4U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 6U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 8U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 10U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 12U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out -= 832; + } +} +void static ffor_15bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + uint16_t src; + for (int i = 
0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 1U; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 2U; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 3U; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 4U; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 5U; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 6U; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 7U; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 8U; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 9U; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 10U; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 11U; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 12U; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 3U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 13U; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 14U; + src = *(in + 64 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 1U; + *(out + i) = tmp; + out -= 896; + } +} +void static ffor_16bit_16ow(const uint16_t* __restrict in, + uint16_t* __restrict out, + const uint16_t* __restrict a_base_p) { + uint16_t tmp = 0U; + 
uint16_t src; + for (int i = 0; i < 64; i++) { + src = *(in + 64 * 0 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 1 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 2 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 3 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 4 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 5 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 6 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 7 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 8 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 9 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 10 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 11 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 12 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 13 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 14 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 64; + src = *(in + 64 * 15 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out -= 960; + } +} +void static ffor_0bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) {} +void static ffor_1bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 1U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 2U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 3U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 5U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 6U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 7U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 9U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 10U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 11U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 12U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 13U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 14U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 15U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 17U; + src = *(in + 32 * 18 + i) - 
*(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 18U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 19U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 20U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 21U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 22U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 23U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 24U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 25U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 26U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 27U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 28U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 29U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 30U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out -= 0; + } +} +void static ffor_2bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 2U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 4U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 6U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 8U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 10U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 12U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 14U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 18U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 20U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 22U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 24U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 26U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 28U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 2U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 4U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 6U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src 
= src & ((1ULL << 2) - 1); + tmp |= src << 8U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 10U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 12U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 14U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 18U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 20U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 22U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 24U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 26U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 28U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out -= 32; + } +} +void static ffor_3bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 3U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 6U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 9U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 12U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 15U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 18U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 21U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 24U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 27U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src >> 2U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 1U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 7U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 10U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 13U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 19U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 22U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 25U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 28U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & 
((1ULL << 3) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src >> 1U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 2U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 5U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 11U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 14U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 17U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 20U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 23U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 26U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out -= 64; + } +} +void static ffor_4bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 16U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 20U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 24U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 16U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 20U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 24U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 16U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 20U; + src = *(in + 32 * 
22 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 24U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 16U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 20U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 24U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out -= 96; + } +} +void static ffor_5bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 5U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 10U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 15U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 20U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 25U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 2U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 3U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 13U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 18U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 23U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 1U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 6U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 11U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 21U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 26U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 1U; + src = *(in + 32 * 20 + i) - *(a_base_p); 
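+ // Annotation (editorial, not part of the generated kernel): value 19's five
+ // bits straddle the 32-bit word boundary here -- shifted left by 31, only its
+ // lowest bit fit into the previous output word. The `tmp = src >> 1U` just
+ // above carries its remaining four high bits into bits 0-3 of the next word,
+ // and value 20, loaded in the preceding statement, continues at shift 4.
+ // Every ffor_Nbit_32ow kernel repeats this carry pattern at each output word
+ // whose boundary a value straddles, i.e. whenever N does not divide 32; the
+ // 1/2/4/8/16-bit kernels need no such carries.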
+ src = src & ((1ULL << 5) - 1); + tmp |= src << 4U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 9U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 14U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 19U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 24U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp = src >> 3U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 2U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 7U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 12U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 17U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 22U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 5) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out -= 128; + } +} +void static ffor_6bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 6U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 12U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 18U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 24U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 2U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 4U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 10U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 22U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 4U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 2U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 8U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 14U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 20U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 6U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 
6) - 1); + tmp |= src << 12U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 18U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 24U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 2U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 4U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 10U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 22U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp = src >> 4U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 2U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 8U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 14U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 20U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 6) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out -= 160; + } +} +void static ffor_7bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 7U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 14U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 21U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 3U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 10U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 17U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 24U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 1U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 6U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 13U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 20U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 5U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = 
src & ((1ULL << 7) - 1); + tmp |= src << 2U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 9U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 23U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 2U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 5U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 12U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 19U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 6U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 1U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 15U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 22U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp = src >> 3U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 4U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 11U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 18U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 7) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out -= 192; + } +} +void static ffor_8bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 
32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp = src; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 8U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 16U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 8) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out -= 224; + } +} +void static ffor_9bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 9U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 18U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 5U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 13U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 22U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 1U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 17U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 26U; + 
*(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 6U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 3U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 12U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 21U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 2U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 7U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 7U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 2U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 11U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 20U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 3U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 6U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 15U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 1U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 10U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 19U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp = src >> 4U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 5U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 14U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 9) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out -= 256; + } +} +void static ffor_10bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 10U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 20U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 2U; + src = 
*(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 8U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 18U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 4U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 6U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 6U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 4U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 14U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 8U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 2U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 12U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 10U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 20U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 2U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 8U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 18U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 4U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 6U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 6U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 4U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 14U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp = src >> 8U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 2U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 12U; + 
src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 10) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out -= 288; + } +} +void static ffor_11bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 11U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 10U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 1U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 12U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 9U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 2U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 13U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 3U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 14U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 7U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 15U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 6U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 5U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 5U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 6U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 17U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 4U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 7U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 18U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= 
src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 3U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 19U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 2U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 9U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 20U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp = src >> 1U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 10U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 11) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out -= 320; + } +} +void static ffor_12bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 16U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 16U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 
32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 16U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 12U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 8U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 4U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 16U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp = src >> 4U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 8U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 12) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out -= 352; + } +} +void static ffor_13bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 13U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 6U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 7U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 12U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 1U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 14U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 5U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp 
|= src << 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 11U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 2U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 15U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 9U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 10U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 3U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 3U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 10U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 9U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 4U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 17U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 2U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 11U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 5U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 18U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 1U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 12U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 7U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 6U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out -= 384; + } +} +void static ffor_14bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + 
const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 14U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 4U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 10U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 8U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 6U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 12U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 2U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 2U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 12U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 6U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 8U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 10U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 4U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 14U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 4U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 10U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 8U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 6U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 12U; + src = *(in + 32 * 23 + i) - 
*(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 2U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 2U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 12U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 6U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 8U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 10U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 4U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out -= 416; + } +} +void static ffor_15bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 15U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 2U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 13U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 11U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 6U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 9U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 7U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 10U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 5U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 12U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = 
src & ((1ULL << 15) - 1); + tmp |= src << 3U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 14U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 1U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 1U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 14U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 3U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 12U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 5U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 10U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 7U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 9U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 6U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 11U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 4U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 13U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 2U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out -= 448; + } +} +void static ffor_16bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - 
*(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out -= 480; + } +} +void static ffor_17bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 17) 
- 1); + tmp = src >> 15U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 2U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 13U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 11U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 6U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 9U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 7U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 10U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 5U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 12U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 3U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 14U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 1U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 1U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 14U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 3U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 12U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 5U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 10U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL 
<< 17) - 1); + tmp |= src << 7U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 9U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 6U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 11U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 4U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 13U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 2U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out -= 512; + } +} +void static ffor_18bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 14U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 4U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 10U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 8U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 6U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 12U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 2U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 2U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 12U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 6U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src 
& ((1ULL << 18) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 8U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 10U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 4U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 14U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 4U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 10U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 8U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 6U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 12U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 2U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 2U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 12U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 6U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 8U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 10U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 4U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out -= 544; + } +} +void static ffor_19bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src; + src 
= *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 13U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 6U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 7U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 12U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 1U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 14U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 5U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 11U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 2U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 15U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 9U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 10U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 3U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 3U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 10U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 9U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 4U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 
19) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 17U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 2U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 11U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 5U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 18U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 1U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 12U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 7U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 6U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out -= 576; + } +} +void static ffor_20bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 
32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + 
i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out -= 608; + } +} +void static ffor_21bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 11U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 10U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 1U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 12U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 9U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 2U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 13U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 3U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 14U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 7U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 15U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 6U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 5U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp 
|= src << 5U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 6U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 17U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 4U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 7U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 18U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 3U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 19U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 2U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 9U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 20U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 1U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 10U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out -= 640; + } +} +void static ffor_22bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 10U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 20U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 2U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; 
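+ // --- editorial note; a sketch, not part of the generated kernel code ---
+ // Every ffor_<N>bit_32ow routine in this file follows the same FFOR
+ // (frame-of-reference + bit-packing) recipe over 32 interleaved lanes:
+ // rebase each value against *a_base_p, mask it to its N-bit budget, and
+ // OR it into consecutive 32-bit output words. A value that straddles a
+ // word boundary (here value 32*4+i at 22 bits, whose low 8 bits went into
+ // the previous word) is reloaded so its remaining high bits, `src >> 8`,
+ // seed the next word. A minimal scalar sketch of one packing term, using
+ // a hypothetical helper name purely for illustration:
+ //
+ //   static inline uint32_t ffor_term(uint32_t v, uint32_t base, unsigned n) {
+ //     return (v - base) & static_cast<uint32_t>((1ULL << n) - 1); // rebase, keep n bits
+ //   }
+ //
+ // The closing `out -= (N-1)*32` in each routine (e.g. 672 = 21*32 here)
+ // rewinds the cursor after N stores so `out + i` stays lane-aligned
+ // across iterations of i.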
+ src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 8U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 18U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 4U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 6U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 6U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 4U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 14U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 8U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 2U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 12U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 10U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 20U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 2U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 8U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 18U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 4U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 6U; + src = *(in + 32 * 24 + i) -
*(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 6U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 4U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 14U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 8U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 2U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 12U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out -= 672; + } +} +void static ffor_23bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 9U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 18U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 5U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 13U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 22U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 1U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 17U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src 
& ((1ULL << 23) - 1); + tmp |= src << 6U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 3U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 12U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 21U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 2U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 7U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 7U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 2U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 11U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 20U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 3U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 6U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 15U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 1U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 10U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 19U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 4U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 
23) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 5U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 14U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out -= 704; + } +} +void static ffor_24bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src 
& ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out -= 736; + } +} +void static ffor_25bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 7U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 14U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 21U; + src = *(in + 32 * 4 + i) 
- *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 3U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 10U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 17U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 24U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 1U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 6U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 13U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 20U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 5U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 2U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 9U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 23U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 2U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 5U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 12U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 19U; + src = *(in + 32 
* 22 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 6U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 1U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 15U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 22U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 3U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 4U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 11U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 18U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out -= 768; + } +} +void static ffor_26bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 6U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 12U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 18U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 24U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 2U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 4U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src 
= *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 10U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 22U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 4U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 2U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 8U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 14U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 20U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 6U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 12U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 18U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 24U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 2U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 4U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 10U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 
10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 22U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 4U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 2U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 8U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 14U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 20U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out -= 800; + } +} +void static ffor_27bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 5U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 10U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 15U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 20U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 25U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 2U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 3U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 13U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= 
src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 18U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 23U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 1U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 6U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 11U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 21U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 26U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 1U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 4U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 9U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 14U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 19U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 24U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 3U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 2U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - 
*(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 7U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 12U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 17U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 22U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out -= 832; + } +} +void static ffor_28bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 4U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 8U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 12U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 16U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 20U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 24U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 4U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 8U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 12U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 
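+ // 28-bit lane: since 8 x 28 = 7 x 32, every 8 values fill exactly 7 words and
+ // the chain restarts with a fresh tmp (no carry across the group boundary);
+ // four such groups give 28 words per lane, hence out -= 27 * 32 = 864 at the end.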
32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 16U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 20U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 24U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 4U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 8U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 12U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 16U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 20U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 24U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 4U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 8U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 12U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 16U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 20U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src 
& ((1ULL << 28) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp = src >> 24U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 28) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out -= 864; + } +} +void static ffor_29bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 3U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 6U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 9U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 12U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 15U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 18U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 21U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 24U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 27U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 2U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 1U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 4U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 7U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 
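+ // 29-bit lane: 32 x 29 = 928 bits = 29 words per lane, rewound with
+ // out -= 28 * 32 = 896. Where three values meet in one word (values 9..11
+ // above), the carry is followed by two |= merges into the same tmp.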
* 14 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 10U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 13U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 19U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 22U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 25U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 28U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 1U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 2U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 5U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 8U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 11U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 14U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 17U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 20U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp = src >> 23U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & 
((1ULL << 29) - 1); + tmp = src >> 26U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 29) - 1); + tmp |= src << 3U; + *(out + i) = tmp; + out -= 896; + } +} +void static ffor_30bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 2U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 4U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 6U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 8U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 10U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 12U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 14U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 16U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 18U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 20U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 22U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 24U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 26U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 
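+ // 30-bit lane: 16 x 30 = 15 x 32, so the carry chain restarts cleanly at
+ // value 16; two groups of 15 words = 30 words per lane, hence out -= 29 * 32 = 928.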
* 14 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 28U; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 2U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 4U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 6U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 8U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 10U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 12U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 14U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 16U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 18U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 20U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 22U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 24U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 26U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 30) - 1); + tmp = src >> 28U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 30) 
- 1); + tmp |= src << 2U; + *(out + i) = tmp; + out -= 928; + } +} +void static ffor_31bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 1U; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 2U; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 3U; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 4U; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 5U; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 6U; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 7U; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 8U; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 9U; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 10U; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 11U; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 12U; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 13U; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 14U; + src = *(in + 32 * 15 + i) - 
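+ // 31-bit lane: one unbroken 32-value carry chain; each word combines the high
+ // bits of one value (src >> k) with the low bits of the next (src << (31 - k)),
+ // the two shifts always summing to 31; 31 words per lane, out -= 30 * 32 = 960.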
*(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 15U; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 16U; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 17U; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 18U; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 19U; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 20U; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 21U; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 22U; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 23U; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 24U; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 25U; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 26U; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 27U; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 28U; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 3U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 29U; + src = *(in + 32 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - 
*(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp = src >> 30U; + src = *(in + 32 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 31) - 1); + tmp |= src << 1U; + *(out + i) = tmp; + out -= 960; + } +} +void static ffor_32bit_32ow(const uint32_t* __restrict in, + uint32_t* __restrict out, + const uint32_t* __restrict a_base_p) { + uint32_t tmp = 0U; + uint32_t src; + for (int i = 0; i < 32; i++) { + src = *(in + 32 * 0 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 1 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 2 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 3 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 4 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 5 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 6 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 7 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 8 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 9 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 10 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 11 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 12 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 13 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 14 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 15 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 16 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 17 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 18 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 19 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 20 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 21 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 22 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 23 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 24 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 25 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 26 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 27 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 28 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 29 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 30 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 32; + src = *(in + 32 * 31 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out -= 992; + } +} +void static ffor_0bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) {} +void static ffor_1bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int 
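+ // The 32-bit case above degenerates to plain frame-of-reference subtraction:
+ // no masking, one output word per value (out -= 31 * 32 = 992). From here on,
+ // the _64ow kernels pack into 16 lanes of 64-bit output words (indices
+ // 16 * k + i, 64 values per lane). At 1 bit each, all 64 values fit in a single
+ // word, so out is never advanced and the closing out -= 0 is a generator no-op;
+ // the 0-bit kernel is empty because every value equals the base.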
i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 1U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 2U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 3U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 4U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 5U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 6U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 7U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 9U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 10U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 11U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 12U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 13U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 14U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 15U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 17U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 18U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 19U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 20U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 21U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 22U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 23U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 24U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 25U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 26U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 27U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 28U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 29U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 30U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 31U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 33U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 34U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 35U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 
1) - 1); + tmp |= src << 36U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 37U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 38U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 39U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 40U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 41U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 42U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 43U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 44U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 45U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 46U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 47U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 48U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 49U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 50U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 51U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 52U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 53U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 54U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 55U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 56U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 57U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 58U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 59U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 60U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 61U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 62U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 1) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out -= 0; + } +} +void static ffor_2bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 2U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 4U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 6U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 8U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 10U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 12U; + src = 
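+ // 2-bit, 64-bit words: 64 x 2 = 128 bits = 2 words per lane; 32 values per word
+ // at even shifts, with a clean restart at value 32 and a tail rewind of out -= 16.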
*(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 14U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 18U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 20U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 22U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 24U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 26U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 28U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 30U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 34U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 36U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 38U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 40U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 42U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 44U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 46U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 48U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 50U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 52U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 54U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 56U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 58U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 60U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 2U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 4U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 6U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 8U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 10U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 12U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 14U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 18U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 20U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = 
src & ((1ULL << 2) - 1); + tmp |= src << 22U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 24U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 26U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 28U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 30U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 34U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 36U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 38U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 40U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 42U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 44U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 46U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 48U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 50U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 52U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 54U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 56U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 58U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 60U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 2) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out -= 16; + } +} +void static ffor_3bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 3U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 6U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 9U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 12U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 15U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 18U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 21U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 24U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 27U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 30U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 33U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 36U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src 
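+ // 3-bit, 64-bit words: 64 x 3 = 192 bits = 3 words per lane; values 21 and 42
+ // straddle word boundaries (contributing 1 and 2 low bits respectively), so
+ // their high bits carry as src >> 1 and src >> 2; tail rewind out -= 2 * 16 = 32.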
<< 39U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 42U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 45U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 48U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 51U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 54U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 57U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 60U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src >> 1U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 2U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 5U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 11U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 14U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 17U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 20U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 23U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 26U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 29U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 35U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 38U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 41U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 44U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 47U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 50U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 53U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 56U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 59U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp = src >> 2U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 1U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 4U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 7U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 10U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 
13U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 19U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 22U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 25U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 28U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 31U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 34U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 37U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 40U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 43U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 46U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 49U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 52U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 55U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 58U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 3) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out -= 32; + } +} +void static ffor_4bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 8U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 12U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 16U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 20U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 24U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 28U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 32U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 36U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 40U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 44U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 48U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 52U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 56U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 4) - 1); + tmp |= src << 4U; + src = 
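+ // 4-bit, 64-bit words: 4 divides 64, so each word takes exactly 16 values and
+ // the chain restarts at values 16, 32 and 48; 4 words per lane, out -= 3 * 16 = 48.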
*(in + 16 * 18 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 8U;
+ src = *(in + 16 * 19 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 12U;
+ src = *(in + 16 * 20 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 16U;
+ src = *(in + 16 * 21 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 20U;
+ src = *(in + 16 * 22 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 24U;
+ src = *(in + 16 * 23 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 28U;
+ src = *(in + 16 * 24 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 32U;
+ src = *(in + 16 * 25 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 36U;
+ src = *(in + 16 * 26 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 40U;
+ src = *(in + 16 * 27 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 44U;
+ src = *(in + 16 * 28 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 48U;
+ src = *(in + 16 * 29 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 52U;
+ src = *(in + 16 * 30 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 56U;
+ src = *(in + 16 * 31 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 60U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 32 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp = src;
+ src = *(in + 16 * 33 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 4U;
+ src = *(in + 16 * 34 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 8U;
+ src = *(in + 16 * 35 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 12U;
+ src = *(in + 16 * 36 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 16U;
+ src = *(in + 16 * 37 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 20U;
+ src = *(in + 16 * 38 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 24U;
+ src = *(in + 16 * 39 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 28U;
+ src = *(in + 16 * 40 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 32U;
+ src = *(in + 16 * 41 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 36U;
+ src = *(in + 16 * 42 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 40U;
+ src = *(in + 16 * 43 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 44U;
+ src = *(in + 16 * 44 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 48U;
+ src = *(in + 16 * 45 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 52U;
+ src = *(in + 16 * 46 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 56U;
+ src = *(in + 16 * 47 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 60U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 48 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp = src;
+ src = *(in + 16 * 49 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 4U;
+ src = *(in + 16 * 50 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 8U;
+ src = *(in + 16 * 51 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 12U;
+ src = *(in + 16 * 52 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 16U;
+ src = *(in + 16 * 53 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 20U;
+ src = *(in + 16 * 54 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 24U;
+ src = *(in + 16 * 55 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 28U;
+ src = *(in + 16 * 56 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 32U;
+ src = *(in + 16 * 57 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 36U;
+ src = *(in + 16 * 58 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 40U;
+ src = *(in + 16 * 59 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 44U;
+ src = *(in + 16 * 60 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 48U;
+ src = *(in + 16 * 61 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 52U;
+ src = *(in + 16 * 62 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 56U;
+ src = *(in + 16 * 63 + i) - *(a_base_p);
+ src = src & ((1ULL << 4) - 1);
+ tmp |= src << 60U;
+ *(out + i) = tmp;
+ out -= 48;
+ }
+}
+void static ffor_5bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled form of the generated kernel: subtract the frame-of-reference
+ // base, keep the low 5 bits, and pack 64 values per lane (16 interleaved
+ // lanes) into 5 output words per lane, carrying straddled bits forward.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 5) - 1);
+ tmp |= src << bit;
+ bit += 5U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit -= 64U;
+ tmp = src >> (5U - bit);
+ }
+ }
+ }
+}
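For reference, decoding reverses the layout above: walk each lane's packed words, shift out one w-bit slice per value, and add the base back. Below is a minimal scalar sketch for the 5-bit case; the function name is hypothetical, and any real decoder in this library lives elsewhere, so this only illustrates the inverse of the layout.

// Illustrative decoder for the 5-bit layout (name hypothetical): undoes
// ffor_5bit_64ow by shifting slices back out and re-adding the base.
void static unffor_5bit_64ow_sketch(const uint64_t* __restrict in,
                                    uint64_t* __restrict out,
                                    const uint64_t* __restrict a_base_p) {
  for (int i = 0; i < 16; i++) {
    const uint64_t* w = in + i; // current packed word of lane i
    unsigned bit = 0U;          // read position inside that word
    for (int v = 0; v < 64; v++) {
      uint64_t src = *w >> bit;
      if (bit + 5U > 64U) {     // value straddles into the next word
        src |= *(w + 16) << (64U - bit);
      }
      *(out + 16 * v + i) = (src & ((1ULL << 5) - 1)) + *(a_base_p);
      bit += 5U;
      if (bit >= 64U) {
        bit -= 64U;
        w += 16;
      }
    }
  }
}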
+void static ffor_6bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled equivalent of the generated 6-bit kernel: 64 FOR deltas per lane
+ // (16 lanes) into 6 words per lane; straddled values carry into the next word.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 6) - 1);
+ tmp |= src << bit;
+ bit += 6U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit -= 64U;
+ tmp = src >> (6U - bit);
+ }
+ }
+ }
+}
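Every shift constant in these kernels is determined by the bit width alone. With w = 6, value v occupies bits [6v, 6v + 6) of its lane's 64 * 6 = 384-bit stream, i.e. packed word floor(6v / 64), stored at out[16 * word + i], at offset 6v mod 64. Value 10 starts at offset 60, so only four of its six bits fit into word 0 (src << 60); the two carried bits open word 1 as src >> 4, since 64 - 60 = 4 bits were already consumed. Widths that divide 64, such as the 4-bit and 8-bit cases here, never straddle a word boundary and therefore need no carry shifts at all.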
+void static ffor_7bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled equivalent of the generated 7-bit kernel: 64 FOR deltas per lane
+ // (16 lanes) into 7 words per lane; straddled values carry into the next word.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 7) - 1);
+ tmp |= src << bit;
+ bit += 7U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit -= 64U;
+ tmp = src >> (7U - bit);
+ }
+ }
+ }
+}
+void static ffor_8bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled equivalent of the generated 8-bit kernel: 64 FOR deltas per lane
+ // (16 lanes) into 8 words per lane; 8 divides 64, so no value straddles a word.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 8) - 1);
+ tmp |= src << bit;
+ bit += 8U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit = 0U;
+ tmp = 0U;
+ }
+ }
+ }
+}
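The loop bounds fix the block shape: each call consumes 1024 input values (16 interleaved lanes times 64 values) and emits 16 * w packed words, so the 8-bit kernel writes 128 words (1024 bytes). A minimal calling sketch follows; pack_block_example and its buffer names are hypothetical, and it assumes the kernel declarations above are in scope.

#include <cstdint>

// Hypothetical wrapper (not part of the patch).
void pack_block_example(const uint64_t* values /* 1024 entries */,
                        uint64_t* packed /* 16 * 8 = 128 words */,
                        uint64_t base /* e.g. the block minimum */) {
  // Lossless only if every (value - base) fits in 8 bits.
  ffor_8bit_64ow(values, packed, &base);
}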
+void static ffor_9bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled equivalent of the generated 9-bit kernel: 64 FOR deltas per lane
+ // (16 lanes) into 9 words per lane; straddled values carry into the next word.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 9) - 1);
+ tmp |= src << bit;
+ bit += 9U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit -= 64U;
+ tmp = src >> (9U - bit);
+ }
+ }
+ }
+}
+void static ffor_10bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled equivalent of the generated 10-bit kernel: 64 FOR deltas per lane
+ // (16 lanes) into 10 words per lane; straddled values carry into the next word.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 10) - 1);
+ tmp |= src << bit;
+ bit += 10U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit -= 64U;
+ tmp = src >> (10U - bit);
+ }
+ }
+ }
+}
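One caveat these kernels make visible: the subtraction relies on unsigned wraparound, and the mask simply discards high bits, so packing is lossless only if every (value - base) fits in w bits. The caller must pick the width and base accordingly. A tiny illustration with hypothetical numbers:

#include <cstdint>

// With w = 10, only deltas below 1024 survive the mask.
uint64_t base = 1000;
uint64_t ok   = (1900 - base) & ((1ULL << 10) - 1); // 900: decodes as 900 + base
uint64_t bad  = (2100 - base) & ((1ULL << 10) - 1); // 1100 & 1023 = 76: high bit lost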
+void static ffor_11bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled equivalent of the generated 11-bit kernel: 64 FOR deltas per lane
+ // (16 lanes) into 11 words per lane; straddled values carry into the next word.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 11) - 1);
+ tmp |= src << bit;
+ bit += 11U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit -= 64U;
+ tmp = src >> (11U - bit);
+ }
+ }
+ }
+}
+void static ffor_12bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ // Rolled equivalent of the generated 12-bit kernel: 64 FOR deltas per lane
+ // (16 lanes) into 12 words per lane; straddled values carry into the next word.
+ for (int i = 0; i < 16; i++) {
+ uint64_t tmp = 0U;
+ uint64_t* o = out + i;
+ unsigned bit = 0U;
+ for (int v = 0; v < 64; v++) {
+ uint64_t src = (*(in + 16 * v + i) - *(a_base_p)) & ((1ULL << 12) - 1);
+ tmp |= src << bit;
+ bit += 12U;
+ if (bit >= 64U) {
+ *o = tmp;
+ o += 16;
+ bit -= 64U;
+ tmp = src >> (12U - bit);
+ }
+ }
+ }
+}
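Callers typically know the required bit width only at run time, so one plausible pattern is a thin dispatcher over the generated kernels. The sketch below is hypothetical (not part of the patch) and assumes all kernel declarations in this file are in scope; widths outside this hunk would be handled by the rest of the generated file.

// Hypothetical dispatcher over the kernels shown in this hunk.
void static ffor_64ow_dispatch(const uint64_t* in, uint64_t* out,
                               const uint64_t* base, unsigned w) {
  switch (w) {
  case 5: ffor_5bit_64ow(in, out, base); break;
  case 6: ffor_6bit_64ow(in, out, base); break;
  case 7: ffor_7bit_64ow(in, out, base); break;
  case 8: ffor_8bit_64ow(in, out, base); break;
  case 9: ffor_9bit_64ow(in, out, base); break;
  case 10: ffor_10bit_64ow(in, out, base); break;
  case 11: ffor_11bit_64ow(in, out, base); break;
  case 12: ffor_12bit_64ow(in, out, base); break;
  case 13: ffor_13bit_64ow(in, out, base); break;
  default: break;
  }
}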
+void static ffor_13bit_64ow(const uint64_t* __restrict in,
+ uint64_t* __restrict out,
+ const uint64_t* __restrict a_base_p) {
+ uint64_t tmp = 0U;
+ uint64_t src;
+ for (int i = 0; i < 16; i++) {
+ src = *(in + 16 * 0 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src;
+ src = *(in + 16 * 1 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 13U;
+ src = *(in + 16 * 2 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 26U;
+ src = *(in + 16 * 3 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 39U;
+ src = *(in + 16 * 4 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 52U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 4 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src >> 12U;
+ src = *(in + 16 * 5 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 1U;
+ src = *(in + 16 * 6 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 14U;
+ src = *(in + 16 * 7 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 27U;
+ src = *(in + 16 * 8 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 40U;
+ src = *(in + 16 * 9 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 53U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 9 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src >> 11U;
+ src = *(in + 16 * 10 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 2U;
+ src = *(in + 16 * 11 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 15U;
+ src = *(in + 16 * 12 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 28U;
+ src = *(in + 16 * 13 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 41U;
+ src = *(in + 16 * 14 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 54U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 14 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src >> 10U;
+ src = *(in + 16 * 15 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 3U;
+ src = *(in + 16 * 16 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 16U;
+ src = *(in + 16 * 17 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 29U;
+ src = *(in + 16 * 18 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 42U;
+ src = *(in + 16 * 19 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 55U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 19 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src >> 9U;
+ src = *(in + 16 * 20 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 4U;
+ src = *(in + 16 * 21 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 17U;
+ src = *(in + 16 * 22 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 30U;
+ src = *(in + 16 * 23 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 43U;
+ src = *(in + 16 * 24 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 56U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 24 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src >> 8U;
+ src = *(in + 16 * 25 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 5U;
+ src = *(in + 16 * 26 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 18U;
+ src = *(in + 16 * 27 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 31U;
+ src = *(in + 16 * 28 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 44U;
+ src = *(in + 16 * 29 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 57U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 29 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src >> 7U;
+ src = *(in + 16 * 30 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 6U;
+ src = *(in + 16 * 31 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 19U;
+ src = *(in + 16 * 32 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 32U;
+ src = *(in + 16 * 33 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 45U;
+ src = *(in + 16 * 34 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 58U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 34 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp = src >> 6U;
+ src = *(in + 16 * 35 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 7U;
+ src = *(in + 16 * 36 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 20U;
+ src = *(in + 16 * 37 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 33U;
+ src = *(in + 16 * 38 + i) - *(a_base_p);
+ src = src & ((1ULL << 13) - 1);
+ tmp |= src << 46U;
+ src = *(in + 16 * 39 + i) -
*(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 5U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 21U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 34U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 47U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 4U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 9U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 22U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 35U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 48U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 3U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 10U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 23U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 36U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 49U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 2U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 11U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 24U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 37U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 50U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp = src >> 1U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 12U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 25U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 38U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 13) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out -= 192; + } +} +void static ffor_14bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 14U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 
14) - 1); + tmp |= src << 28U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 42U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 8U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 6U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 20U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 34U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 48U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 2U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 12U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 26U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 40U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 10U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 4U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 18U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 46U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 4U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 10U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 24U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 38U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 12U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 2U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 30U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 44U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 6U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 8U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 22U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 36U; + src = *(in + 
16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 14U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 28U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 42U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 8U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 6U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 20U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 34U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 48U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 2U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 12U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 26U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 40U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 10U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 4U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 18U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 46U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 4U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 10U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 24U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 38U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 12U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 2U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 30U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 44U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + 
i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp = src >> 6U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 8U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 22U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 36U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 14) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out -= 208; + } +} +void static ffor_15bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 15U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 30U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 45U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 4U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 11U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 26U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 41U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 7U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 22U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 37U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 12U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 3U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 18U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 33U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 48U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 1U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 14U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 29U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 44U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 5U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 
1); + tmp |= src << 10U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 25U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 40U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 9U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 6U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 21U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 36U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 13U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 2U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 17U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 47U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 2U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 13U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 28U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 43U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 6U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 9U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 24U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 39U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 10U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 5U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 20U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 35U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 14U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 1U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 31U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 46U; + src = *(in 
+ 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 3U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 12U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 27U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 42U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 7U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 23U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 38U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp = src >> 11U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 4U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 19U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 34U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 15) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out -= 224; + } +} +void static ffor_16bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = 
*(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 50 + i) - 
*(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp = src; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 16U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 32U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 16) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out -= 240; + } +} +void static ffor_17bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 17U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 34U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 13U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 4U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 21U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 38U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 9U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 25U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 42U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 5U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 12U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 29U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 46U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & 
((1ULL << 17) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 1U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 33U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 14U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 3U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 20U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 37U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 10U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 7U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 24U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 41U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 6U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 11U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 28U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 45U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 2U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 15U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 15U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 2U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 19U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 36U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 11U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 6U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 23U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 40U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - 
*(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 7U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 10U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 27U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 44U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 3U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 14U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 31U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 1U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 18U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 35U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 12U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 5U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 22U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 39U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 9U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 26U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 43U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp = src >> 4U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 13U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 30U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 17) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out -= 256; + } +} +void static ffor_18bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 18U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 36U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - 
*(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 10U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 8U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 26U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 44U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 2U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 34U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 12U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 6U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 24U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 42U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 4U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 14U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 14U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 4U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 22U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 40U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 6U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 12U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 30U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 2U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 20U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 38U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 8U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 10U; + src = *(in + 16 * 30 + i) - *(a_base_p); + 
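+ // Each ffor_<W>bit_64ow kernel packs 1024 FOR-encoded values: 64 values per
+ // lane across 16 interleaved lanes (the stride-16 addressing on i), each
+ // reduced by the shared base *a_base_p and masked to its low W bits (here
+ // W = 18) before being OR-ed into the current output word.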
src = src & ((1ULL << 18) - 1); + tmp |= src << 28U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 18U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 36U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 10U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 8U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 26U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 44U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 2U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 34U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 12U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 6U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 24U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 42U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 4U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 14U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 14U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 4U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 22U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 40U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 6U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 12U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 30U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 16U; + src = *(in + 16 * 57 + 
i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 2U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 20U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 38U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp = src >> 8U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 10U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 28U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 18) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out -= 272; + } +} +void static ffor_19bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 19U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 38U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 7U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 12U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 31U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 14U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 5U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 24U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 43U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 2U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 17U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 36U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 9U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 10U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 29U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 3U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 22U; + src = *(in + 16 * 19 + i) - 
*(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 41U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 4U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 15U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 34U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 11U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 27U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 18U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 1U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 20U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 39U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 6U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 13U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 13U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 6U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 25U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 44U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 1U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 18U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 37U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 11U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 30U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 15U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 4U; + 
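+ // Word-boundary carry: a W-bit value starting at bit offset b with b + W > 64
+ // first contributes its low 64 - b bits via `src << b`; after that word is
+ // stored, the value is re-read and its remaining W - (64 - b) high bits seed
+ // the next word via `tmp = src >> (64 - b)` (e.g. W = 19, b = 46 gives the
+ // `src >> 18U` re-read above).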
src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 23U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 42U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 3U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 35U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 10U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 9U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 28U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 17U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 2U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 21U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 40U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 5U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 14U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 33U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp = src >> 12U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 7U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 26U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 19) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out -= 288; + } +} +void static ffor_20bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 20U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 40U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 36U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 56U; + *(out + 
i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 32U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 20U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 40U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 36U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 32U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 
20) - 1); + tmp |= src << 20U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 40U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 36U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 32U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 20U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 40U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 4U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 16U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 36U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 8U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 12U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 32U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 12U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 8U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 28U; + src = *(in + 16 * 60 + i) - *(a_base_p); + 
src = src & ((1ULL << 20) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp = src >> 16U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 4U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 24U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 20) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out -= 304; + } +} +void static ffor_21bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 21U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 42U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 1U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 20U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 41U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 2U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 19U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 40U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 3U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 18U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 39U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 4U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 17U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 38U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 5U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 37U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 6U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 15U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 36U; + src = *(in + 16 * 21 + i) 
- *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 7U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 14U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 35U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 13U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 34U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 9U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 12U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 33U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 10U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 11U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 11U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 10U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 31U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 12U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 9U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 30U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 13U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 29U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 14U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 7U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 28U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL 
<< 21) - 1); + tmp = src >> 15U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 6U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 27U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 5U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 26U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 17U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 4U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 25U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 18U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 3U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 24U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 19U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 2U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 23U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp = src >> 20U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 1U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 22U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 21) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out -= 320; + } +} +void static ffor_22bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 22U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 20U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 2U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 24U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 18U; + src = *(in + 16 * 6 + i) - *(a_base_p); 
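+ // Every packing step in this kernel follows the same generated pattern: subtract the
+ // frame-of-reference base *a_base_p, mask the delta to the kernel's bit width
+ // (22 bits here), and OR it into the 64-bit accumulator at the running offset.
+ // A value that straddles a word boundary is read twice: its low bits finish the
+ // current word and its high bits, right-shifted, seed the next one (value 2
+ // above spills 2 bits past bit 63, hence tmp = src >> 20U). Each lane emits 22
+ // words at stride 16; the trailing out -= 336 (= 21 * 16) rewinds before i
+ // advances to the next of the 16 interleaved lanes, so a full 1024-value block
+ // packs into 16 * 22 output words.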
+ src = src & ((1ULL << 22) - 1); + tmp |= src << 4U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 26U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 6U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 28U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 14U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 8U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 30U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 12U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 10U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 10U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 12U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 34U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 8U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 14U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 36U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 6U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 38U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 4U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 18U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 40U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 2U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 20U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 42U; + *(out + i) = tmp; 
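+ // Halfway realignment: 32 values x 22 bits = 704 bits = exactly 11 words, so
+ // the pattern repeats from here -- value 32 starts a fresh word with the same
+ // shift sequence (0, 22, 44, ...) that value 0 used.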
+ out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 22U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 20U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 2U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 24U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 18U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 4U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 26U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 6U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 28U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 14U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 8U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 30U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 12U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 10U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 10U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 12U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 34U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 8U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 14U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 36U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 6U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL 
<< 22) - 1); + tmp |= src << 38U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 4U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 18U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 40U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp = src >> 2U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 20U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 22) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out -= 336; + } +} +void static ffor_23bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 23U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 18U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 5U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 28U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 13U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 10U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 33U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 15U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 38U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 3U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 20U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 21U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 2U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 25U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 16U; + src = *(in 
+ 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 7U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 30U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 11U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 12U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 35U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 6U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 17U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 40U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 1U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 22U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 19U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 4U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 27U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 14U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 9U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 9U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 14U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 37U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 4U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 19U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 22U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 1U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 24U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = 
src & ((1ULL << 23) - 1); + tmp = src >> 17U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 6U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 29U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 12U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 11U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 34U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 7U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 39U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 2U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 21U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 20U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 3U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 26U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 15U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 31U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 10U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 13U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 36U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp = src >> 5U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 18U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 23) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out -= 352; + } +} +void static ffor_24bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 2 
+ i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & 
((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= 
src << 16U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 24U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 16U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 8U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 32U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp = src >> 8U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 16U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 24) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out -= 368; + } +} +void static ffor_25bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 25U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 14U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 11U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 36U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 3U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 22U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 17U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 33U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 6U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 19U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 20U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 5U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 30U; + src = *(in + 16 * 15 + i) - *(a_base_p); 
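+ // Value 15 lands at bit offset 55 and straddles the word: it is loaded and
+ // masked twice (once for the 9 low bits stored here, once as src >> 9U for the
+ // 16 carried bits) rather than cached in a register. Keeping every step
+ // structurally identical is presumably what lets the compiler vectorize the
+ // body uniformly across the 16 interleaved lanes.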
+ src = src & ((1ULL << 25) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 9U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 23U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 2U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 27U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 12U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 13U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 38U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 1U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 24U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 15U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 10U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 35U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 4U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 21U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 18U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 7U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 7U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 18U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 21U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 4U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 29U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src 
= src & ((1ULL << 25) - 1); + tmp = src >> 10U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 15U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 24U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 1U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 26U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 13U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 12U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 37U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 2U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 23U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 9U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 34U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 5U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 20U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 19U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 6U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 31U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 17U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 22U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 3U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 28U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp = src >> 11U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src 
<< 14U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 25) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out -= 384; + } +} +void static ffor_26bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 26U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 12U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 14U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 24U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 2U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 28U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 10U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 22U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 4U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 30U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 8U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 18U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 20U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 6U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 6U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 20U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 18U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 8U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 34U; + src = *(in + 16 * 22 + i) - 
*(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 4U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 22U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 10U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 36U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 2U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 24U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 14U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 12U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 26U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 12U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 14U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 24U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 2U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 28U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 10U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 22U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 4U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 30U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 8U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 18U; + src = *(in + 16 * 46 + i) - *(a_base_p); 
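+ // Like the other even widths, 26 bits realigns halfway through the block:
+ // 32 values x 26 bits = 832 bits = 13 whole words, which is why value 32 above
+ // restarted the accumulator with tmp = src and the first half's shift pattern
+ // repeats verbatim for values 32..63.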
+ src = src & ((1ULL << 26) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 20U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 6U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 6U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 20U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 18U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 8U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 34U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 4U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 22U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 10U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 36U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 2U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 24U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp = src >> 14U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 12U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 26) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out -= 400; + } +} +void static ffor_27bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 27U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp = src >> 10U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 17U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 27) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = 
+void static ffor_27bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+ for (int i = 0; i < 16; i++) {
+  uint64_t* ow   = out + i;
+  uint64_t  tmp  = 0U;
+  uint32_t  bits = 0U;
+  for (int j = 0; j < 64; j++) {
+   uint64_t src = (*(in + 16 * j + i) - *(a_base_p)) & ((1ULL << 27) - 1);
+   tmp |= src << bits;
+   bits += 27U;
+   if (bits >= 64U) {
+    *ow = tmp;
+    ow += 16;
+    bits -= 64U;
+    tmp = (bits == 0U) ? 0ULL : (src >> (27U - bits));
+   }
+  }
+ }
+}
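+// W = 28 divides the lane cleanly: 16 values occupy 16 * 28 = 448 = 7 * 64
+// bits, so a lane's bit stream realigns to a word boundary after every 16
+// values and no value straddles those points.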
+void static ffor_28bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+ for (int i = 0; i < 16; i++) {
+  uint64_t* ow   = out + i;
+  uint64_t  tmp  = 0U;
+  uint32_t  bits = 0U;
+  for (int j = 0; j < 64; j++) {
+   uint64_t src = (*(in + 16 * j + i) - *(a_base_p)) & ((1ULL << 28) - 1);
+   tmp |= src << bits;
+   bits += 28U;
+   if (bits >= 64U) {
+    *ow = tmp;
+    ow += 16;
+    bits -= 64U;
+    tmp = (bits == 0U) ? 0ULL : (src >> (28U - bits));
+   }
+  }
+ }
+}
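+// Straddling rule used by these kernels: value j starts at bit
+// s = (W * j) % 64 of the current word. If s + W > 64, the low 64 - s bits
+// are emitted now (src << s) and the remaining high bits open the next word
+// (src >> (64 - s)). For W = 29, e.g., value 2 starts at bit 58 and spills
+// its high 23 bits into the following word.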
+void static ffor_29bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+ for (int i = 0; i < 16; i++) {
+  uint64_t* ow   = out + i;
+  uint64_t  tmp  = 0U;
+  uint32_t  bits = 0U;
+  for (int j = 0; j < 64; j++) {
+   uint64_t src = (*(in + 16 * j + i) - *(a_base_p)) & ((1ULL << 29) - 1);
+   tmp |= src << bits;
+   bits += 29U;
+   if (bits >= 64U) {
+    *ow = tmp;
+    ow += 16;
+    bits -= 64U;
+    tmp = (bits == 0U) ? 0ULL : (src >> (29U - bits));
+   }
+  }
+ }
+}
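+// The mask ((1ULL << W) - 1) is applied before every shift, so a delta wider
+// than W bits is truncated rather than allowed to corrupt neighbouring values
+// in the packed word; tmp carries bits across words only within a single
+// lane iteration, never across lanes.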
+void static ffor_30bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+ for (int i = 0; i < 16; i++) {
+  uint64_t* ow   = out + i;
+  uint64_t  tmp  = 0U;
+  uint32_t  bits = 0U;
+  for (int j = 0; j < 64; j++) {
+   uint64_t src = (*(in + 16 * j + i) - *(a_base_p)) & ((1ULL << 30) - 1);
+   tmp |= src << bits;
+   bits += 30U;
+   if (bits >= 64U) {
+    *ow = tmp;
+    ow += 16;
+    bits -= 64U;
+    tmp = (bits == 0U) ? 0ULL : (src >> (30U - bits));
+   }
+  }
+ }
+}
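+// Presumed calling convention (sketch, inferred from the indexing above, not
+// from a caller in this patch): pack one block of 16 * 64 = 1024 deltas
+// against a single base into 16 * W output words, e.g. for W = 31:
+//   uint64_t in[1024], out[16 * 31], base = /* per-block minimum */;
+//   ffor_31bit_64ow(in, out, &base);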
+void static ffor_31bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+ for (int i = 0; i < 16; i++) {
+  uint64_t* ow   = out + i;
+  uint64_t  tmp  = 0U;
+  uint32_t  bits = 0U;
+  for (int j = 0; j < 64; j++) {
+   uint64_t src = (*(in + 16 * j + i) - *(a_base_p)) & ((1ULL << 31) - 1);
+   tmp |= src << bits;
+   bits += 31U;
+   if (bits >= 64U) {
+    *ow = tmp;
+    ow += 16;
+    bits -= 64U;
+    tmp = (bits == 0U) ? 0ULL : (src >> (31U - bits));
+   }
+  }
+ }
+}
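+// W = 32 is the degenerate case of the straddling rule: two values fill each
+// word exactly (2 * 32 = 64), so every word closes on a value boundary and
+// the kernel never splits a value.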
+void static ffor_32bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+ for (int i = 0; i < 16; i++) {
+  uint64_t* ow   = out + i;
+  uint64_t  tmp  = 0U;
+  uint32_t  bits = 0U;
+  for (int j = 0; j < 64; j++) {
+   uint64_t src = (*(in + 16 * j + i) - *(a_base_p)) & ((1ULL << 32) - 1);
+   tmp |= src << bits;
+   bits += 32U;
+   if (bits >= 64U) {
+    *ow = tmp;
+    ow += 16;
+    bits -= 64U;
+    tmp = (bits == 0U) ? 0ULL : (src >> (32U - bits));
+   }
+  }
+ }
+}
+void static ffor_33bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+ uint64_t tmp = 0U;
+ uint64_t src;
+ for (int i = 0; i < 16; i++) {
+ src = *(in + 16 * 0 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src;
+ src = *(in + 16 * 1 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 33U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 1 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 31U;
+ src = *(in + 16 * 2 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 2U;
+ src = *(in + 16 * 3 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 35U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 3 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 29U;
+ src = *(in + 16 * 4 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 4U;
+ src = *(in + 16 * 5 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 37U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 5 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 27U;
+ src = *(in + 16 * 6 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 6U;
+ src = *(in + 16 * 7 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 39U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 7 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 25U;
+ src = *(in + 16 * 8 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 8U;
+ src = *(in + 16 * 9 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 41U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 9 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 23U;
+ src = *(in + 16 * 10 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 10U;
+ src = *(in + 16 * 11 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 43U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 11 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 21U;
+ src = *(in + 16 * 12 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 12U;
+ src = *(in + 16 * 13 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 45U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 13 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 19U;
+ src = *(in + 16 * 14 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 14U;
+ src = *(in + 16 * 15 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 47U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 15 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 17U;
+ src = *(in + 16 * 16 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 16U;
+ src = *(in + 16 * 17 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 49U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 17 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 15U;
+ src = *(in + 16 * 18 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 18U;
+ src = *(in + 16 * 19 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 51U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 19 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 13U;
+ src = *(in + 16 * 20 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 20U;
+ src = *(in + 16 * 21 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 53U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 21 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 11U;
+ src = *(in + 16 * 22 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 22U;
+ src = *(in + 16 * 23 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 55U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 23 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 9U;
+ src = *(in + 16 * 24 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 24U;
+ src = *(in + 16 * 25 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 57U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 25 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 7U;
+ src = *(in + 16 * 26 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 26U;
+ src = *(in + 16 * 27 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 59U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 27 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 5U;
+ src = *(in + 16 * 28 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 28U;
+ src = *(in + 16 * 29 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 61U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 29 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 3U;
+ src = *(in + 16 * 30 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 30U;
+ src = *(in + 16 * 31 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 63U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 31 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 1U;
+ src = *(in + 16 * 32 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 32U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 32 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 32U;
+ src = *(in + 16 * 33 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 1U;
+ src = *(in + 16 * 34 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 34U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 34 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 30U;
+ src = *(in + 16 * 35 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 3U;
+ src = *(in + 16 * 36 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 36U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 36 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 28U;
+ src = *(in + 16 * 37 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 5U;
+ src = *(in + 16 * 38 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 38U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 38 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 26U;
+ src = *(in + 16 * 39 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 7U;
+ src = *(in + 16 * 40 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 40U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 40 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 24U;
+ src = *(in + 16 * 41 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 9U;
+ src = *(in + 16 * 42 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 42U;
+ *(out + i) = tmp;
+ out += 16;
+ src = *(in + 16 * 42 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp = src >> 22U;
+ src = *(in + 16 * 43 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src << 11U;
+ src = *(in + 16 * 44 + i) - *(a_base_p);
+ src = src & ((1ULL << 33) - 1);
+ tmp |= src
<< 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 20U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 13U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 18U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 15U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 17U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 14U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 19U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 12U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 21U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 10U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 23U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 25U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 6U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 27U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 4U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 29U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp = src >> 2U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 33) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out -= 512; + } +} +void static ffor_34bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & 
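+// ffor_33bit_64ow (closed above) packs 1024 frame-of-reference deltas into
+// 33-bit fields: for each of 16 interleaved lanes it subtracts *a_base_p
+// from 64 input values, masks each to 33 bits, and ORs them into 64-bit
+// output words, splitting any value that straddles a word boundary across
+// two words. Each lane fills exactly 33 output words (64 * 33 = 2112 bits),
+// so after 32 advances of 16 words the epilogue rewinds out by 512.
+//
+// For reference, this is the generic loop that these unrolled kernels
+// specialize (shown only as a comment so it does not alter the generated
+// code; the name ffor_pack_ref is illustrative, not part of this patch):
+//
+//   // 'out' must hold 16 * bit_width zero-initialized uint64_t words.
+//   static void ffor_pack_ref(const uint64_t* in, uint64_t* out,
+//                             uint64_t base, unsigned bit_width) {
+//     const uint64_t mask =
+//         bit_width < 64 ? ((1ULL << bit_width) - 1) : ~0ULL;
+//     for (unsigned lane = 0; lane < 16; ++lane) {
+//       unsigned bit_pos = 0; // write cursor in this lane's bit stream
+//       for (unsigned v = 0; v < 64; ++v) {
+//         uint64_t src   = (in[16 * v + lane] - base) & mask;
+//         unsigned word  = bit_pos / 64;
+//         unsigned shift = bit_pos % 64;
+//         out[16 * word + lane] |= src << shift;
+//         if (shift + bit_width > 64) // value straddles two output words
+//           out[16 * (word + 1) + lane] |= src >> (64U - shift);
+//         bit_pos += bit_width;
+//       }
+//     }
+//   }
+//
+// The generated kernels below fix bit_width at compile time and fully
+// unroll this shift schedule, which removes the div/mod and the branch.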
((1ULL << 34) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 30U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 4U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 26U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 8U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 22U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 12U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 18U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 14U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 20U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 10U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 24U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 6U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 28U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 2U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 2U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 28U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 6U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 24U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 10U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = 
*(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 20U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 14U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 18U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 12U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 22U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 8U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 26U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 4U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 30U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 4U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 26U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 8U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 22U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 12U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 18U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 14U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 20U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 10U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= 
src << 24U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 6U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 28U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 2U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 2U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 28U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 6U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 24U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 10U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 20U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 14U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 18U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 12U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 22U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 8U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 26U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp = src >> 4U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 34) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out -= 528; + } +} +void static ffor_35bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & 
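+// ffor_34bit_64ow (closed above): the same pattern at 34 bits. Because
+// 34 * 32 = 1088 is a multiple of 64, the bit stream realigns after 32
+// values and the generated code restarts a fresh word (tmp = src) at
+// input 16 * 32. Each lane fills 34 output words; the epilogue rewinds
+// out by 33 * 16 = 528.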
((1ULL << 35) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 29U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 6U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 23U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 12U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 17U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 18U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 11U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 24U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 5U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 34U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 1U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 28U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 7U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 22U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 13U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 19U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 10U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 25U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 4U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 33U; + src = 
*(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 2U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 27U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 21U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 14U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 15U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 20U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 9U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 26U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 3U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 3U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 26U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 9U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 20U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 15U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 14U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 21U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 27U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 2U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp 
|= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 31U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 4U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 25U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 10U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 19U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 13U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 22U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 7U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 28U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 1U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 30U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 5U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 24U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 11U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 18U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 17U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 12U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 23U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp = src >> 6U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 35) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out -= 544; + } +} +void static ffor_36bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* 
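+// ffor_35bit_64ow (closed above): 35 shares no factor with 64, so the
+// stream never realigns mid-block and every output word after the first
+// mixes pieces of two or three values. 35 words per lane; the epilogue
+// rewinds out by 34 * 16 = 544.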
__restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 28U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 8U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 20U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 16U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 12U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 24U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 4U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 32U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 4U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 24U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 12U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 16U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 20U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 8U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 28U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 8U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 20U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 16U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 52U; + *(out 
+ i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 12U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 24U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 4U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 32U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 4U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 24U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 12U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 16U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 20U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 8U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 28U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 8U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 20U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 16U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 12U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 24U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 4U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 32U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 4U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 
+ i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 24U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 12U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 16U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 20U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 8U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 28U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 8U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 20U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 16U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 12U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 24U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 4U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 32U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 4U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 24U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 12U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 16U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 20U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp = src >> 8U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 36) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out -= 560; + } +} +void static ffor_37bit_64ow(const uint64_t* __restrict in, + 
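+// ffor_36bit_64ow (closed above): 36 * 16 = 576 is a multiple of 64, so
+// the stream realigns every 16 values and the generated code restarts
+// fresh words at inputs 16 * 16, 16 * 32, and 16 * 48. 36 words per lane;
+// the epilogue rewinds out by 35 * 16 = 560.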
uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 27U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 10U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 17U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 20U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 7U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 34U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 3U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 24U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 13U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 14U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 23U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 4U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 31U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 6U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 21U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 11U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 26U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 1U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src 
& ((1ULL << 37) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 28U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 9U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 18U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 19U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 35U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 2U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 25U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 12U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 15U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 22U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 5U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 5U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 22U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 15U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 12U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 25U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 2U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 29U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 
8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 19U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 18U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 9U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 36U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 1U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 26U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 11U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 21U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 6U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 33U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 4U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 23U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 14U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 13U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 24U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 3U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 30U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 7U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 20U; + src = *(in + 16 * 61 + i) - 
*(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 17U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp = src >> 10U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 37) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out -= 576; + } +} +void static ffor_38bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 26U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 12U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 14U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 24U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 2U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 28U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 10U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 22U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 4U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 30U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 8U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 18U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 20U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 6U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + 
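+// ffor_37bit_64ow (closed above): 37 is coprime to 64, so no intermediate
+// realignment occurs within the block. 37 words per lane; the epilogue
+// rewinds out by 36 * 16 = 576.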
tmp = src >> 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 6U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 20U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 18U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 8U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 34U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 4U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 22U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 10U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 36U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 2U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 24U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 14U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 12U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 26U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 12U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 14U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 24U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 2U; + src = *(in + 16 * 38 + i) 
- *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 28U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 10U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 22U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 4U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 30U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 8U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 18U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 20U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 6U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 6U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 20U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 18U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 8U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 34U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 4U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 22U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 38) - 1); + tmp = src >> 10U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 
38) - 1);
+        tmp |= src << 28U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 58 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp = src >> 36U;
+        src = *(in + 16 * 59 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp |= src << 2U;
+        src = *(in + 16 * 60 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp |= src << 40U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 60 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp = src >> 24U;
+        src = *(in + 16 * 61 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp |= src << 14U;
+        src = *(in + 16 * 62 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp |= src << 52U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 62 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp = src >> 12U;
+        src = *(in + 16 * 63 + i) - *(a_base_p);
+        src = src & ((1ULL << 38) - 1);
+        tmp |= src << 26U;
+        *(out + i) = tmp;
+        out -= 592;
+    }
+}
+void static ffor_39bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+    uint64_t tmp = 0U;
+    uint64_t src;
+    for (int i = 0; i < 16; i++) {
+        src = *(in + 16 * 0 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src;
+        src = *(in + 16 * 1 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 39U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 1 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 25U;
+        src = *(in + 16 * 2 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 14U;
+        src = *(in + 16 * 3 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 53U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 3 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 11U;
+        src = *(in + 16 * 4 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 28U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 4 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 36U;
+        src = *(in + 16 * 5 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 3U;
+        src = *(in + 16 * 6 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 42U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 6 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 22U;
+        src = *(in + 16 * 7 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 17U;
+        src = *(in + 16 * 8 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 56U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 8 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 8U;
+        src = *(in + 16 * 9 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 31U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 9 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 33U;
+        src = *(in + 16 * 10 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 6U;
+        src = *(in + 16 * 11 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 45U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 11 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 19U;
+        src = *(in + 16 * 12 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 20U;
+        src = *(in + 16 * 13 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp |= src << 59U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 13 + i) - *(a_base_p);
+        src = src & ((1ULL << 39) - 1);
+        tmp = src >> 5U;
+        src = *(in + 16 *
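The unrolled bodies, such as the `ffor_39bit_64ow` opening above, all instantiate one skeleton: OR masked values into an accumulator word, store it through `*(out + i)` once 64 bits are full, and seed the next word with the bits that spilled over (`tmp = src >> k`). Below is a compact rolled-up equivalent for a single lane, offered only as a readability sketch under the same width < 64 assumption; the generated, fully unrolled form exists so the compiler sees constant shift amounts.

#include <cstdint>

// Rolled-up sketch of the packing skeleton (single lane, illustrative name).
// Assumes width < 64; writes n * width bits to `out`.
static void ffor_pack_scalar(const uint64_t* in, uint64_t* out,
                             uint64_t base, unsigned width, int n) {
    const uint64_t mask = (1ULL << width) - 1;
    uint64_t tmp = 0;
    unsigned used = 0;  // bits already occupied in tmp
    for (int k = 0; k < n; ++k) {
        const uint64_t src = (in[k] - base) & mask;  // FFOR step
        tmp |= src << used;
        used += width;
        if (used >= 64) {  // word is full: store it, carry the spill
            *out++ = tmp;
            used -= 64;
            tmp = used ? (src >> (width - used)) : 0;
        }
    }
    if (used) *out++ = tmp;  // flush a partially filled tail word
}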
14 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 30U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 9U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 23U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 2U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 27U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 12U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 13U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 38U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 1U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 24U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 15U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 10U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 35U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 4U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 21U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 18U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 7U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 7U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & 
((1ULL << 39) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 18U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 21U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 4U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 29U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 10U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 15U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 24U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 1U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 26U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 13U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 12U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 37U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 2U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 23U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 9U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 34U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 5U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 20U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 19U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 
58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 6U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 31U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 17U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 22U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 3U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 28U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 11U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp = src >> 14U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 39) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out -= 608; + } +} +void static ffor_40bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 
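The addressing in these kernels is interleaved: `in + 16 * k + i` is the k-th value of lane i, and the `out += 16` stride keeps each lane's packed words 16 apart, so the 16 iterations of the outer `for (i ...)` loop are fully independent and amenable to a 16-wide SIMD rewrite. A small sketch of that index mapping, assuming the 1024-value block layout (16 lanes of 64 values) the kernels imply; function names are ours.

#include <cstddef>

// Index mapping implied by `in + 16 * k + i` and `out += 16`:
// a 1024-value block viewed as 16 interleaved lanes of 64 values.
inline std::size_t value_index(std::size_t k, std::size_t lane) {
    return 16 * k + lane;  // k-th value of `lane` in the input block
}
inline std::size_t word_index(std::size_t w, std::size_t lane) {
    return 16 * w + lane;  // w-th packed word of `lane` in the output
}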
10 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 31 + i) - *(a_base_p); 
+ src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - 
*(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 24U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 16U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 8U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 32U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 8U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp = src >> 16U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 40) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out -= 624; + } +} +void static ffor_41bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 23U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 18U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 5U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 28U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 13U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 10U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 33U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= 
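The closing `out -= 624;` above, like `out -= 592;` and `out -= 608;` before it, is the per-lane rewind: 64 values at W bits fill exactly W 64-bit words, and `out` advances 16 words after each of the first W - 1 stores, so subtracting 16 * (W - 1) restores the entry pointer before the next lane begins. A compile-time restatement of that arithmetic (the helper name is ours, not the patch's):

// Per-lane rewind for a W-bit kernel: 64 * W bits == W words,
// and `out` advanced 16 * (W - 1) words along the way.
constexpr unsigned rewind_words(unsigned w) { return 16u * (w - 1u); }
static_assert(rewind_words(38) == 592, "matches the 38-bit kernel");
static_assert(rewind_words(39) == 608, "matches the 39-bit kernel");
static_assert(rewind_words(40) == 624, "matches the 40-bit kernel");
static_assert(rewind_words(43) == 672, "matches the 43-bit kernel");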
src << 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 15U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 38U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 3U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 20U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 21U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 2U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 25U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 7U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 30U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 11U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 12U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 35U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 6U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 17U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 40U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 1U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 22U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 19U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 
16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 4U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 27U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 14U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 9U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 9U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 14U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 37U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 4U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 19U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 22U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 1U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 24U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 17U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 6U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 29U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 12U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 11U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp = src >> 34U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 41) - 1); + tmp |= src << 7U; + src = *(in + 16 * 48 + i) - 
*(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 48U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 48 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 16U;
+        src = *(in + 16 * 49 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 25U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 49 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 39U;
+        src = *(in + 16 * 50 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 2U;
+        src = *(in + 16 * 51 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 43U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 51 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 21U;
+        src = *(in + 16 * 52 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 20U;
+        src = *(in + 16 * 53 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 61U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 53 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 3U;
+        src = *(in + 16 * 54 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 38U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 54 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 26U;
+        src = *(in + 16 * 55 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 15U;
+        src = *(in + 16 * 56 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 56U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 56 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 8U;
+        src = *(in + 16 * 57 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 33U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 57 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 31U;
+        src = *(in + 16 * 58 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 10U;
+        src = *(in + 16 * 59 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 51U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 59 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 13U;
+        src = *(in + 16 * 60 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 28U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 60 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 36U;
+        src = *(in + 16 * 61 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 5U;
+        src = *(in + 16 * 62 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 46U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 62 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp = src >> 18U;
+        src = *(in + 16 * 63 + i) - *(a_base_p);
+        src = src & ((1ULL << 41) - 1);
+        tmp |= src << 23U;
+        *(out + i) = tmp;
+        out -= 640;
+    }
+}
+void static ffor_42bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+    uint64_t tmp = 0U;
+    uint64_t src;
+    for (int i = 0; i < 16; i++) {
+        src = *(in + 16 * 0 + i) - *(a_base_p);
+        src = src & ((1ULL << 42) - 1);
+        tmp = src;
+        src = *(in + 16 * 1 + i) - *(a_base_p);
+        src = src & ((1ULL << 42) - 1);
+        tmp |= src << 42U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 1 + i) - *(a_base_p);
+        src = src & ((1ULL << 42) - 1);
+        tmp = src >> 22U;
+        src = *(in + 16 * 2 + i) - *(a_base_p);
+        src = src & ((1ULL << 42) - 1);
+        tmp |= src << 20U;
+        src = *(in + 16 * 3 + i) - *(a_base_p);
+        src = src & ((1ULL <<
42) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 2U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 24U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 18U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 4U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 26U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 6U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 28U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 14U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 8U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 30U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 12U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 10U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 10U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 12U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 34U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 8U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 14U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= 
src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 36U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 6U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 38U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 4U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 18U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 40U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 2U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 20U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 22U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 20U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 2U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 24U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 18U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 4U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 26U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 6U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 36U; + 
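The fresh `tmp = src;` restart at `in + 16 * 32 + i` just above is not arbitrary: a W-bit packing realigns to a 64-bit word boundary every lcm(64, W) / W values, which is every 32 values at 42 bits, every 8 at 40 bits, every 16 at 44 bits, and only at the end of the 64-value block for odd widths such as 39, 41 and 43. A one-line helper computing that period, for illustration only:

#include <numeric>  // std::lcm

// How often a W-bit packing realigns to a 64-bit word boundary.
constexpr unsigned realign_period(unsigned w) { return std::lcm(64u, w) / w; }
static_assert(realign_period(40) == 8,  "40-bit kernel restarts every 8 values");
static_assert(realign_period(42) == 32, "42-bit kernel restarts every 32 values");
static_assert(realign_period(44) == 16, "44-bit kernel restarts every 16 values");
static_assert(realign_period(39) == 64, "odd widths only realign per block");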
*(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 28U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 14U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 8U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 30U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 12U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 10U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 10U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 12U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 34U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 8U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 14U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 36U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 6U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 38U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 4U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 18U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 40U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 2U; + src = 
*(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp = src >> 20U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 42) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out -= 656; + } +} +void static ffor_43bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 21U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 42U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 1U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 20U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 41U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 2U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 19U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 40U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 3U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 18U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 39U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 4U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 17U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 38U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 5U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + 
i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 37U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 6U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 15U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 36U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 7U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 14U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 35U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 13U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 34U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 9U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 12U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 33U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 10U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 11U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 11U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 10U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - 
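Each of these pack kernels implies an inverse that extracts W-bit fields, stitches together the fields that straddle a word boundary, and adds the base back. The patch's own unpack routines are not in this hunk; what follows is a hedged scalar sketch of the inverse for one lane, with hypothetical naming.

#include <cstdint>

// Hedged sketch of the inverse (single lane): extract n width-bit
// fields from `in` and re-add the frame base. Assumes width < 64.
static void unffor_scalar(const uint64_t* in, uint64_t* out,
                          uint64_t base, unsigned width, int n) {
    const uint64_t mask = (1ULL << width) - 1;
    unsigned used = 0;  // bits already consumed in *in
    for (int k = 0; k < n; ++k) {
        uint64_t v = *in >> used;
        if (used + width > 64) {  // field straddles two words
            ++in;
            v |= *in << (64U - used);
        }
        used += width;
        if (used >= 64) {
            used -= 64;
            if (used == 0) ++in;  // field ended exactly on a boundary
        }
        out[k] = (v & mask) + base;
    }
}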
*(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 31U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 12U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 9U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 30U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 13U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 29U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 14U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 7U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 28U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 15U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 6U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 27U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 5U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 26U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 17U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 4U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp = src >> 25U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 43) - 1); + tmp |= src << 18U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 43) 
- 1);
+        tmp |= src << 61U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 55 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp = src >> 3U;
+        src = *(in + 16 * 56 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 40U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 56 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp = src >> 24U;
+        src = *(in + 16 * 57 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 19U;
+        src = *(in + 16 * 58 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 62U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 58 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp = src >> 2U;
+        src = *(in + 16 * 59 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 41U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 59 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp = src >> 23U;
+        src = *(in + 16 * 60 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 20U;
+        src = *(in + 16 * 61 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 63U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 61 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp = src >> 1U;
+        src = *(in + 16 * 62 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 42U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 62 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp = src >> 22U;
+        src = *(in + 16 * 63 + i) - *(a_base_p);
+        src = src & ((1ULL << 43) - 1);
+        tmp |= src << 21U;
+        *(out + i) = tmp;
+        out -= 672;
+    }
+}
+void static ffor_44bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+    uint64_t tmp = 0U;
+    uint64_t src;
+    for (int i = 0; i < 16; i++) {
+        src = *(in + 16 * 0 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp = src;
+        src = *(in + 16 * 1 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 44U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 1 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp = src >> 20U;
+        src = *(in + 16 * 2 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 24U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 2 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp = src >> 40U;
+        src = *(in + 16 * 3 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 4U;
+        src = *(in + 16 * 4 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 48U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 4 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp = src >> 16U;
+        src = *(in + 16 * 5 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 28U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 5 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp = src >> 36U;
+        src = *(in + 16 * 6 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 8U;
+        src = *(in + 16 * 7 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 52U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 7 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp = src >> 12U;
+        src = *(in + 16 * 8 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src << 32U;
+        *(out + i) = tmp;
+        out += 16;
+        src = *(in + 16 * 8 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp = src >> 32U;
+        src = *(in + 16 * 9 + i) - *(a_base_p);
+        src = src & ((1ULL << 44) - 1);
+        tmp |= src
<< 12U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 8U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 28U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 16U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 4U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 24U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 20U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 40U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 4U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 16U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 36U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 8U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 12U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 32U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 12U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 8U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 28U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 16U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 60U; + 
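// Annotation: like every ffor_<w>bit_64ow kernel in this file, ffor_44bit_64ow subtracts the frame-of-reference base *a_base_p, masks each delta to its low 44 bits, and OR-packs the results into 64-bit words; out advances by 16 per emitted word, so word w of lane i lands at out[16 * w + i]. + // Since gcd(44, 64) = 4, the carry chain realigns every 16 inputs (11 words) - hence the fresh "tmp = src" at input words 16, 32 and 48 - and the closing "out -= 688" undoes the 43 pointer advances (16 * 43). +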
*(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 4U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 24U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 20U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 40U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 4U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 16U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 36U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 8U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 12U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 32U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 12U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 8U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 28U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 16U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 4U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 24U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src 
<< 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 20U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 40U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 4U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 16U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 36U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 8U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 12U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 32U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 12U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 8U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 28U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 16U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 4U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp = src >> 24U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 44) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out -= 688; + } +} +void static ffor_45bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 19U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 38U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 7U; + 
src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 12U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 31U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 14U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 5U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 24U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 43U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 2U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 17U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 36U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 9U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 10U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 29U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 3U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 22U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 41U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 4U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 15U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 30U; + *(out + i) = tmp; 
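+ // ffor_45bit_64ow: 45 is coprime with 64, so each lane is one unbroken 45-word carry chain with no realignment point; every output word splices the high bits of one masked delta with the low bits of the next.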
+ out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 34U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 11U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 27U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 18U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 1U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 20U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 39U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 6U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 13U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 13U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 6U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 25U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 44U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 1U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 18U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 37U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 
16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 11U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 30U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 15U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 4U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 23U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 42U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 3U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 35U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 10U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 9U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 28U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 17U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 2U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 21U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 40U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 5U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 14U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 33U; + 
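// Size check: 1024 inputs * 45 bits = 46080 bits = 720 words = 45 words per lane * 16 lanes, matching the "out -= 704" rewind below (16 * 44 advances undone). +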
src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 12U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 7U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp = src >> 26U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 45) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out -= 704; + } +} +void static ffor_46bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 18U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 36U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 10U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 8U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 26U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 44U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 2U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 34U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 12U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 6U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 24U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 42U; + src = *(in + 16 * 
14 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 4U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 14U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 14U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 4U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 22U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 40U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 6U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 12U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 30U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 2U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 20U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 38U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 8U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 10U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 28U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + 
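// ffor_46bit_64ow: gcd(46, 64) = 2 gives a period of 32 inputs per 23 words, so the chain restarts cleanly with "tmp = src" at input word 32, as it just did above. +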
src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 18U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 36U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 10U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 8U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 26U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 44U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 2U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 34U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 12U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 6U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 24U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 42U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 4U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 14U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 14U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 4U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 22U; + src = 
*(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 40U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 6U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 12U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 30U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 2U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 20U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 38U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 8U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 10U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp = src >> 28U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 46) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out -= 720; + } +} +void static ffor_47bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 17U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 34U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 13U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 4U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 21U; + src = *(in + 16 * 
6 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 38U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 9U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 25U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 42U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 5U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 12U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 29U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 46U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 1U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 33U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 14U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 3U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 20U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 37U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 10U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 7U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + 
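// ffor_47bit_64ow: 47 and 64 are coprime, so each lane is again a single 47-word carry chain; the paired shifts always split one delta so that "src << k" in one word and "src >> (64 - k)" in the next cover all 47 bits. +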
src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 24U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 41U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 6U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 11U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 28U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 45U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 2U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 15U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 15U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 2U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 19U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 36U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 11U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 6U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 23U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 40U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 7U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 10U; + src = 
*(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 27U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 44U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 3U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 14U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 31U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 1U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 18U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 35U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 12U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 5U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 22U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 39U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 9U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 26U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 43U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 4U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 51U; + *(out + i) = 
tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 13U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp = src >> 30U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 47) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out -= 736; + } +} +void static ffor_48bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src 
<< 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 38 + i) - 
*(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 
48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 16U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp = src >> 32U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 48) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out -= 752; + } +} +void static ffor_49bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 15U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 30U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 45U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 4U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 11U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 26U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 41U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 7U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 22U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 37U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 12U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 
61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 3U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 18U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 33U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 48U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 1U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 14U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 29U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 44U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 5U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 10U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 25U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 40U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 9U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 6U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 21U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 36U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 13U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 2U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 
49) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 17U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 47U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 2U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 13U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 28U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 43U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 6U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 9U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 24U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 39U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 10U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 5U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 20U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 35U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 14U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 1U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); 
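+ /* Editorial note (added comment, not generator output): every
+  * ffor_<W>bit_64ow kernel in this file is the fully unrolled form of
+  * the same FFOR (frame-of-reference + bit-packing) transform over
+  * 1024 values laid out as 64 rows of 16 interleaved lanes: subtract
+  * the base, mask the delta to the low W bits, and concatenate the
+  * deltas contiguously across 64-bit output words. A generic,
+  * unoptimized reference loop (illustrative names only, assumes
+  * 0 < W < 64) would look roughly like:
+  *
+  *   for (int lane = 0; lane < 16; ++lane) {
+  *     uint64_t word = 0; unsigned used = 0; uint64_t* o = out + lane;
+  *     for (int v = 0; v < 64; ++v) {
+  *       uint64_t s = (in[16 * v + lane] - base) & ((1ULL << W) - 1);
+  *       word |= s << used;
+  *       if (used + W >= 64) {                // output word is full
+  *         *o = word; o += 16;                // flush, keep lane stride
+  *         word = (used + W > 64) ? s >> (64U - used) : 0;  // carry
+  *       }
+  *       used = (used + W) % 64U;
+  *     }
+  *   }
+  */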
+ src = src & ((1ULL << 49) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 31U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 46U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 3U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 12U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 27U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 42U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 7U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 23U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 38U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 11U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 4U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 19U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp = src >> 34U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 49) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out -= 768; + } +} +void static ffor_50bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 14U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 
16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 28U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 42U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 8U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 6U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 20U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 34U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 48U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 2U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 12U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 26U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 40U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 10U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 4U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 18U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 46U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 4U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 10U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + 
src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 24U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 38U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 12U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 2U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 16U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 30U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 44U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 6U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 8U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 22U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 36U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 14U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 28U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 42U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 8U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 6U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp 
= src >> 20U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 34U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 48U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 2U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 12U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 26U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 40U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 10U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 4U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 18U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 46U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 4U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 10U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 24U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 38U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 12U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 2U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & 
((1ULL << 50) - 1); + tmp = src >> 16U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 30U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 44U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 6U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 8U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 22U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp = src >> 36U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 50) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out -= 784; + } +} +void static ffor_51bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 13U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 26U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 39U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 12U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 1U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 14U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 27U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 40U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 11U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); 
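+ // Added comment (not generator output): this is the straddling case.
+ // Value 10 of the 51-bit kernel starts at bit 62 of the current output
+ // word: `src << 62U` deposits its low 2 bits here, and after the flush
+ // `tmp = src >> 2U` carries its remaining 49 bits into the next word.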
+ tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 2U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 15U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 28U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 41U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 10U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 3U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 29U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 42U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 9U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 4U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 17U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 30U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 43U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 5U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 18U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 
27 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 31U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 44U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 7U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 6U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 19U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 45U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 6U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 7U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 20U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 33U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 46U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 5U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 21U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 34U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 47U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 4U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 
16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 9U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 22U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 35U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 48U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 3U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 10U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 23U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 36U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 49U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 2U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 11U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 24U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 37U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 50U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 1U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 12U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp = src >> 25U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 
51) - 1); + tmp = src >> 38U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 51) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out -= 800; + } +} +void static ffor_52bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 12U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 24U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 36U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 48U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 4U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 8U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 20U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 32U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 44U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 8U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 4U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 16U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 28U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 40U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src 
& ((1ULL << 52) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 12U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 24U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 36U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 48U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 4U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 8U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 20U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 32U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 44U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 8U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 4U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 16U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 28U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 40U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 12U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = 
*(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 24U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 36U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 48U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 4U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 8U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 20U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 32U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 44U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 8U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 4U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 16U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 28U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 40U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 12U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 24U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 36U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= 
src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 48U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 4U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 8U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 20U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 32U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 44U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 8U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 4U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 16U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 28U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp = src >> 40U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 52) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out -= 816; + } +} +void static ffor_53bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 11U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 22U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 33U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 44U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 
53) - 1); + tmp |= src << 9U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 2U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 13U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 24U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 35U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 46U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 7U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 4U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 15U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 26U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 37U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 48U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 5U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 6U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 17U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 28U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 39U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 
* 22 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 50U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 3U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 8U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 19U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 30U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 41U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 52U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 1U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 10U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 21U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 43U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 10U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 1U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 12U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 23U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 34U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src 
>> 45U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 8U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 3U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 14U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 25U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 36U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 47U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 6U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 5U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 27U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 38U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 49U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 4U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 7U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 18U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 29U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 40U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 
53) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 51U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 2U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 9U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 20U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 31U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp = src >> 42U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 53) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out -= 832; + } +} +void static ffor_54bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 10U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 20U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 30U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 40U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 50U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 4U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 6U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 16U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 26U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src 
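For reference, every one of these unrolled ffor_*bit_64ow kernels applies the same per-value transform: subtract the frame-of-reference base, mask down to the target bit width, and append the result to a contiguous bit stream. A minimal scalar sketch of that contract, assuming a bit width below 64 (illustrative only, not code from this patch):

#include <cstddef>
#include <cstdint>

// Pack n values of `bit_width` bits each into `out` (bit_width < 64 assumed).
static void ffor_scalar(const uint64_t* in, uint64_t* out, uint64_t base,
                        unsigned bit_width, size_t n) {
    uint64_t tmp    = 0; // output word currently being filled
    unsigned filled = 0; // bits already used in `tmp`
    for (size_t j = 0; j < n; ++j) {
        uint64_t src = (in[j] - base) & ((1ULL << bit_width) - 1);
        tmp |= src << filled;          // low bits land in the current word
        filled += bit_width;
        if (filled >= 64) {            // word full: flush it
            *out++ = tmp;
            filled -= 64;              // bits of `src` that spilled over
            tmp = (filled != 0) ? src >> (bit_width - filled) : 0;
        }
    }
    if (filled != 0) *out++ = tmp;     // flush the tail word
}

The generated kernels compute exactly this, unrolled over 16 interleaved lanes so that every shift count is a compile-time constant.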
& ((1ULL << 54) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 36U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 46U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 8U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 2U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 12U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 22U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 32U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 42U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 52U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 2U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 8U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 18U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 28U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 38U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 48U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 6U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 4U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 
16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 14U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 24U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 34U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 44U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 10U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 20U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 30U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 40U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 50U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 4U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 6U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 16U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 26U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 36U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 46U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 8U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 
1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 2U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 12U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 22U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 32U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 42U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 52U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 2U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 8U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 18U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 28U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 38U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 48U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 6U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 4U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 14U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 24U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 34U; + src = *(in + 
16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp = src >> 44U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 54) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out -= 848; + } +} +void static ffor_55bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 9U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 18U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 27U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 36U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 45U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 54U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 1U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 17U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 26U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 35U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 44U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 53U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 2U; + 
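The shift constants in the 54-bit kernel that closes above follow directly from the bit offset at which each value starts. Value j begins at bit (j * 54) mod 64 of its lane's packed stream; when it straddles a 64-bit boundary, its low 64 - offset bits go out with `src << offset` and the remaining bits seed the next word with `src >> (64 - offset)`. Worked for the first few values:

    j = 1: offset = (1 * 54) mod 64 = 54  ->  low 10 bits via src << 54, carry 44 bits via src >> 10
    j = 6: offset = (6 * 54) mod 64 = 4   ->  fits whole: src << 4
    j = 7: offset = (7 * 54) mod 64 = 58  ->  low 6 bits via src << 58, carry 48 bits via src >> 6

Output words that both receive a carry and still have room for a full value (here the word holding values 5 through 7) are the ones assembled from three sources before being flushed.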
src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 7U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 25U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 34U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 43U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 52U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 3U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 6U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 15U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 24U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 33U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 42U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 51U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 4U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 5U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 14U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); 
+ src = src & ((1ULL << 55) - 1); + tmp = src >> 23U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 41U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 50U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 5U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 4U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 13U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 22U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 31U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 40U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 49U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 6U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 3U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 12U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 21U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 30U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 39U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 16U; + *(out + i) = 
tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 48U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 7U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 2U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 11U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 20U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 29U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 38U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 47U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 1U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 10U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 19U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 28U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 37U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp = src >> 46U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 55) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out -= 864; + } +} +void static ffor_56bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= 
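The pointer arithmetic shared by all of these kernels encodes the block geometry: lane i of input block j lives at `in + 16 * j + i`, each packed word for lane i is stored at `out + i`, and `out += 16` steps to the lane's next word slot. A b-bit lane of 64 values occupies exactly b words, so after the last write the pointer is rewound by (b - 1) * 16 before the next lane; for the 55-bit kernel above that is 54 * 16 = 864, matching `out -= 864`. A compile-time sketch of that arithmetic (names illustrative):

#include <cstddef>

constexpr size_t kLanes         = 16; // values interleaved per block
constexpr size_t kValuesPerLane = 64;

constexpr size_t packed_words_per_lane(unsigned b) {
    return kValuesPerLane * b / 64;   // == b for a 64-value lane
}
constexpr size_t lane_rewind(unsigned b) {
    return (packed_words_per_lane(b) - 1) * kLanes;
}
static_assert(lane_rewind(53) == 832, "matches `out -= 832` above");
static_assert(lane_rewind(55) == 864, "matches `out -= 864` above");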
src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 19 + 
i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + 
src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + 
*(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 8U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 16U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 24U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 32U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 40U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp = src >> 48U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 56) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out -= 880; + } +} +void static ffor_57bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 7U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 14U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 21U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 28U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 35U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 42U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & 
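The 56-bit kernel that closes above is the byte-aligned special case: 56 = 7 x 8, so each group of 8 values fills exactly 7 output words and the shift schedule (56, 48, ..., 8) simply repeats eight times, with no three-source output words. The same schedule written as loops, as a sketch of what the unrolled code computes (illustrative; the patch keeps it unrolled so every shift count stays constant):

#include <cstdint>

// Same packing schedule as ffor_56bit_64ow, expressed as loops for clarity.
static void ffor_56bit_looped(const uint64_t* __restrict in,
                              uint64_t* __restrict out,
                              const uint64_t* __restrict base) {
    constexpr uint64_t kMask = (1ULL << 56) - 1;
    for (int i = 0; i < 16; ++i) {
        uint64_t* o = out + i;
        for (int g = 0; g < 8; ++g) {          // 8 groups of 8 values
            const uint64_t* v = in + 16 * (8 * g) + i;
            uint64_t tmp = (v[0] - *base) & kMask;
            for (int k = 1; k < 8; ++k) {
                uint64_t src = (v[16 * k] - *base) & kMask;
                tmp |= src << (64 - 8 * k);    // low bits into current word
                *o = tmp;
                o += 16;
                tmp = src >> (8 * k);          // carry into the next word
            }
            // after k == 7 the carry is zero: value 7 ends on a word boundary
        }
    }
}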
((1ULL << 57) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 49U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 56U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 1U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 6U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 13U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 20U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 27U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 34U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 41U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 48U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 55U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 2U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 5U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 12U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 19U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 26U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 33U; + src 
= *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 40U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 47U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 54U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 3U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 4U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 11U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 18U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 25U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 39U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 46U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 53U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 4U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 3U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 10U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 17U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + 
src = src & ((1ULL << 57) - 1); + tmp = src >> 24U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 31U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 38U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 45U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 52U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 5U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 2U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 9U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 16U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 23U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 30U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 37U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 20U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 44U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 51U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 6U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 1U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 8U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + 
out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 15U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 22U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 29U; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 36U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 43U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp = src >> 50U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 57) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out -= 896; + } +} +void static ffor_58bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 6U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 12U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 18U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 24U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 30U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 36U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 42U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 1); + tmp = src >> 48U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 58) - 
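Unpacking is the mirror image of these kernels: read a value's low bits from the word it starts in, OR in the spill-over bits from the next word when it straddles a boundary, mask to b bits, and add the frame-of-reference base back. A minimal scalar sketch of that inverse, assuming b < 64 (illustrative only, not code from this patch):

#include <cstddef>
#include <cstdint>

// Extract n b-bit values from `in` and restore the base (b < 64 assumed).
static void unffor_scalar(const uint64_t* in, uint64_t* out, uint64_t base,
                          unsigned b, size_t n) {
    const uint64_t mask = (1ULL << b) - 1;
    for (size_t j = 0; j < n; ++j) {
        size_t   bit  = j * b;            // absolute bit offset of value j
        size_t   word = bit / 64;
        unsigned off  = bit % 64;
        uint64_t v    = in[word] >> off;  // low part of the value
        if (off + b > 64)                 // straddles a word boundary
            v |= in[word + 1] << (64 - off);
        out[j] = (v & mask) + base;
    }
}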
1);
+    tmp |= src << 10U;
+    *(out + i) = tmp;
+    out += 16;
+    // carry the 4 bits of input word 9 that did not fit, then pack the
+    // remaining words 10..63 of this lane with the same flush pattern
+    tmp = src >> 54U;
+    uint32_t used = 4U;
+    for (int j = 10; j < 64; j++) {
+      src = *(in + 16 * j + i) - *(a_base_p);
+      src = src & ((1ULL << 58) - 1);
+      tmp |= src << used;
+      used += 58U;
+      if (used >= 64U) {
+        *(out + i) = tmp;
+        out += 16;
+        used -= 64U;
+        tmp = src >> (58U - used);  // bits of src that spilled over
+      }
+    }
+    out -= 928;  // 58 stride-16 stores: rewind to this lane's base
+  }
+}
+void static ffor_59bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+  uint64_t tmp = 0U;
+  uint64_t src;
+  // For each of the 16 interleaved lanes: subtract the frame-of-reference
+  // base from 64 input words, keep the low 59 bits of each delta, and
+  // pack them LSB-first into 59 consecutive 64-bit output words.
+  for (int i = 0; i < 16; i++) {
+    uint32_t used = 0U;
+    tmp = 0U;
+    for (int j = 0; j < 64; j++) {
+      src = *(in + 16 * j + i) - *(a_base_p);
+      src = src & ((1ULL << 59) - 1);
+      tmp |= src << used;
+      used += 59U;
+      if (used >= 64U) {
+        *(out + i) = tmp;
+        out += 16;
+        used -= 64U;
+        tmp = src >> (59U - used);  // bits of src that spilled over
+      }
+    }
+    out -= 16 * 59;  // rewind so the next lane starts at the block base
+  }
+}
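+// Usage sketch (illustrative; everything except the kernel name is an
+// assumed caller-side convention): pack one block of 1024 values
+// against a shared frame-of-reference base.
+//   uint64_t base = block_min;
+//   ffor_59bit_64ow(block_in, block_out, &base);
+// This consumes 64 x 16 lane-interleaved input words and emits
+// 59 x 16 output words, i.e. exactly 59 bits per value.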
+void static ffor_60bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+  uint64_t tmp = 0U;
+  uint64_t src;
+  // Same lane-interleaved layout as above with a 60-bit mask: every
+  // 16th input word belongs to lane i, and each lane fills 60 output
+  // words with LSB-first packed deltas.
+  for (int i = 0; i < 16; i++) {
+    uint32_t used = 0U;
+    tmp = 0U;
+    for (int j = 0; j < 64; j++) {
+      src = *(in + 16 * j + i) - *(a_base_p);
+      src = src & ((1ULL << 60) - 1);
+      tmp |= src << used;
+      used += 60U;
+      if (used >= 64U) {
+        *(out + i) = tmp;
+        out += 16;
+        used -= 64U;
+        tmp = src >> (60U - used);  // bits of src that spilled over
+      }
+    }
+    out -= 16 * 60;  // rewind so the next lane starts at the block base
+  }
+}
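+// Size check for the 60-bit kernel above: 1024 values x 60 bits
+// = 61440 bits = 960 output words, i.e. 60 words per lane x 16 lanes,
+// matching the 60 stride-16 stores performed for each lane.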
+void static ffor_61bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+  uint64_t tmp = 0U;
+  uint64_t src;
+  // 61-bit variant of the same scheme: each lane packs 64 masked deltas
+  // into 61 output words, carrying spill-over bits across word
+  // boundaries.
+  for (int i = 0; i < 16; i++) {
+    uint32_t used = 0U;
+    tmp = 0U;
+    for (int j = 0; j < 64; j++) {
+      src = *(in + 16 * j + i) - *(a_base_p);
+      src = src & ((1ULL << 61) - 1);
+      tmp |= src << used;
+      used += 61U;
+      if (used >= 64U) {
+        *(out + i) = tmp;
+        out += 16;
+        used -= 64U;
+        tmp = src >> (61U - used);  // bits of src that spilled over
+      }
+    }
+    out -= 16 * 61;  // rewind so the next lane starts at the block base
+  }
+}
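+// Note on the 61-bit variant above: the subtraction wraps modulo 2^64,
+// so ((1ULL << 61) - 1) keeps exactly the 61 payload bits of each
+// delta; a matching decoder would extract those bits and add the base
+// back (the corresponding unffor kernel is not part of this hunk).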
+void static ffor_62bit_64ow(const uint64_t* __restrict in,
+                            uint64_t* __restrict out,
+                            const uint64_t* __restrict a_base_p) {
+  uint64_t tmp = 0U;
+  uint64_t src;
+  // 62-bit variant: the loop packs input words 0..60 of each lane; the
+  // tail of the lane (words 61..63) continues below.
+  for (int i = 0; i < 16; i++) {
+    uint32_t used = 0U;
+    tmp = 0U;
+    for (int j = 0; j < 61; j++) {
+      src = *(in + 16 * j + i) - *(a_base_p);
+      src = src & ((1ULL << 62) - 1);
+      tmp |= src << used;
+      used += 62U;
+      if (used >= 64U) {
+        *(out + i) = tmp;
+        out += 16;
+        used -= 64U;
+        tmp = src >> (62U - used);  // bits of src that spilled over
+      }
+    }
+    src = *(in + 16 * 61 + i) - *(a_base_p);
+    src = src & ((1ULL << 62) - 1);
+    tmp |= src << 6U;
+    *(out
+ i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 62) - 1); + tmp = src >> 58U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 62) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 62) - 1); + tmp = src >> 60U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 62) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out -= 976; + } +} +void static ffor_63bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 63U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 1U; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 62U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 2U; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 61U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 3U; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 60U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 4U; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 59U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 5U; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 58U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 6U; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 57U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 7U; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 56U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 8U; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 55U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 9U; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 54U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 10U; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 53U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 11U; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 52U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 12U; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 63) 
- 1); + tmp |= src << 51U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 13U; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 50U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 14U; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 49U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 15U; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 48U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 16U; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 47U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 17U; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 46U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 18U; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 45U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 19U; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 44U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 20U; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 43U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 21U; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 42U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 22U; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 41U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 23U; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 40U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 24U; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 39U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 25U; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 38U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 26U; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 37U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 27U; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 36U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 
1); + tmp = src >> 28U; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 35U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 29U; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 34U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 30U; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 33U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 31U; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 32U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 32U; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 31U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 33U; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 30U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 34U; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 29U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 35U; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 28U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 36U; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 27U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 37U; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 26U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 38U; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 25U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 39U; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 24U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 40U; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 23U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 41U; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 22U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 42U; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 21U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 43U; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 20U; + *(out + i) 
= tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 44U; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 19U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 45U; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 18U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 46U; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 17U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 47U; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 16U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 48U; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 15U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 49U; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 14U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 50U; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 13U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 51U; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 12U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 52U; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 11U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 53U; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 10U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 54U; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 9U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 55U; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 8U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 56U; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 7U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 57U; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 6U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 58U; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 5U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 59U; + src = *(in + 16 * 60 
+ i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 4U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 60U; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 3U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 61U; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 2U; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp = src >> 62U; + src = *(in + 16 * 63 + i) - *(a_base_p); + src = src & ((1ULL << 63) - 1); + tmp |= src << 1U; + *(out + i) = tmp; + out -= 992; + } +} +void static ffor_64bit_64ow(const uint64_t* __restrict in, + uint64_t* __restrict out, + const uint64_t* __restrict a_base_p) { + uint64_t tmp = 0U; + uint64_t src; + for (int i = 0; i < 16; i++) { + src = *(in + 16 * 0 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 1 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 2 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 3 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 4 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 5 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 6 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 7 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 8 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 9 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 10 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 11 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 12 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 13 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 14 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 15 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 16 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 17 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 18 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 19 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 20 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 21 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 22 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 23 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 24 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 25 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 26 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 27 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 28 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + 
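// ---------------------------------------------------------------------------
// Editorial sketch, not part of the generated patch: every ffor_<bw>bit_64ow
// kernel above implements the same FFOR (fused frame-of-reference) step,
// fully unrolled for one bit width. Values are processed as 16 interleaved
// lanes (element k of lane i sits at in[16 * k + i]); each value has the
// block base subtracted, is truncated to its low `bw` bits, and the bit
// strings are concatenated into consecutive 64-bit output words. A compact
// loop-based equivalent for a single lane, with hypothetical naming:
#include <cstddef>
#include <cstdint>
static void ffor_pack_one_lane(const uint64_t* in, uint64_t* out, size_t n, unsigned bw, uint64_t base) {
	const uint64_t mask = (bw == 64) ? ~0ULL : ((1ULL << bw) - 1);
	uint64_t word = 0; // output word currently being filled
	unsigned used = 0; // bits of `word` already occupied
	size_t w = 0;      // index of the next output word
	for (size_t k = 0; k < n; ++k) {
		const uint64_t v = (in[k] - base) & mask; // delta against the base, low bw bits
		word |= v << used;
		if (used + bw >= 64) { // word is full; the value may spill into the next word
			out[w++] = word;
			word = (used + bw == 64) ? 0 : v >> (64 - used);
			used = used + bw - 64;
		} else {
			used += bw;
		}
	}
	if (used != 0) out[w] = word; // flush a partially filled last word
}
// The generated kernels trade this loop for straight-line code per bit width,
// removing the data-dependent shifts, which can help the compiler vectorize.
// ---------------------------------------------------------------------------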
out += 16; + src = *(in + 16 * 29 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 30 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 31 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 32 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 33 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 34 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 35 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 36 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 37 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 38 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 39 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 40 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 41 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 42 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 43 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 44 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 45 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 46 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 47 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 48 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 49 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 50 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 51 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 52 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 53 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 54 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 55 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 56 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 57 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 58 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 59 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 60 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 61 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 62 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out += 16; + src = *(in + 16 * 63 + i) - *(a_base_p); + tmp = src; + *(out + i) = tmp; + out -= 1008; + } +} +void ffor(const uint8_t* __restrict in, uint8_t* __restrict out, uint8_t bw, const uint8_t* __restrict a_base_p) { + switch (bw) { + case 0: + ffor_0bit_8ow(in, out, a_base_p); + return; + case 1: + ffor_1bit_8ow(in, out, a_base_p); + return; + case 2: + ffor_2bit_8ow(in, out, a_base_p); + return; + case 3: + ffor_3bit_8ow(in, out, a_base_p); + return; + case 4: + ffor_4bit_8ow(in, out, a_base_p); + return; + case 
5: + ffor_5bit_8ow(in, out, a_base_p); + return; + case 6: + ffor_6bit_8ow(in, out, a_base_p); + return; + case 7: + ffor_7bit_8ow(in, out, a_base_p); + return; + case 8: + ffor_8bit_8ow(in, out, a_base_p); + return; + } +} +void ffor(const uint16_t* __restrict in, uint16_t* __restrict out, uint8_t bw, const uint16_t* __restrict a_base_p) { + switch (bw) { + case 0: + ffor_0bit_16ow(in, out, a_base_p); + return; + case 1: + ffor_1bit_16ow(in, out, a_base_p); + return; + case 2: + ffor_2bit_16ow(in, out, a_base_p); + return; + case 3: + ffor_3bit_16ow(in, out, a_base_p); + return; + case 4: + ffor_4bit_16ow(in, out, a_base_p); + return; + case 5: + ffor_5bit_16ow(in, out, a_base_p); + return; + case 6: + ffor_6bit_16ow(in, out, a_base_p); + return; + case 7: + ffor_7bit_16ow(in, out, a_base_p); + return; + case 8: + ffor_8bit_16ow(in, out, a_base_p); + return; + case 9: + ffor_9bit_16ow(in, out, a_base_p); + return; + case 10: + ffor_10bit_16ow(in, out, a_base_p); + return; + case 11: + ffor_11bit_16ow(in, out, a_base_p); + return; + case 12: + ffor_12bit_16ow(in, out, a_base_p); + return; + case 13: + ffor_13bit_16ow(in, out, a_base_p); + return; + case 14: + ffor_14bit_16ow(in, out, a_base_p); + return; + case 15: + ffor_15bit_16ow(in, out, a_base_p); + return; + case 16: + ffor_16bit_16ow(in, out, a_base_p); + return; + } +} +void ffor(const uint32_t* __restrict in, uint32_t* __restrict out, uint8_t bw, const uint32_t* __restrict a_base_p) { + switch (bw) { + case 0: + ffor_0bit_32ow(in, out, a_base_p); + return; + case 1: + ffor_1bit_32ow(in, out, a_base_p); + return; + case 2: + ffor_2bit_32ow(in, out, a_base_p); + return; + case 3: + ffor_3bit_32ow(in, out, a_base_p); + return; + case 4: + ffor_4bit_32ow(in, out, a_base_p); + return; + case 5: + ffor_5bit_32ow(in, out, a_base_p); + return; + case 6: + ffor_6bit_32ow(in, out, a_base_p); + return; + case 7: + ffor_7bit_32ow(in, out, a_base_p); + return; + case 8: + ffor_8bit_32ow(in, out, a_base_p); + return; + case 9: + ffor_9bit_32ow(in, out, a_base_p); + return; + case 10: + ffor_10bit_32ow(in, out, a_base_p); + return; + case 11: + ffor_11bit_32ow(in, out, a_base_p); + return; + case 12: + ffor_12bit_32ow(in, out, a_base_p); + return; + case 13: + ffor_13bit_32ow(in, out, a_base_p); + return; + case 14: + ffor_14bit_32ow(in, out, a_base_p); + return; + case 15: + ffor_15bit_32ow(in, out, a_base_p); + return; + case 16: + ffor_16bit_32ow(in, out, a_base_p); + return; + case 17: + ffor_17bit_32ow(in, out, a_base_p); + return; + case 18: + ffor_18bit_32ow(in, out, a_base_p); + return; + case 19: + ffor_19bit_32ow(in, out, a_base_p); + return; + case 20: + ffor_20bit_32ow(in, out, a_base_p); + return; + case 21: + ffor_21bit_32ow(in, out, a_base_p); + return; + case 22: + ffor_22bit_32ow(in, out, a_base_p); + return; + case 23: + ffor_23bit_32ow(in, out, a_base_p); + return; + case 24: + ffor_24bit_32ow(in, out, a_base_p); + return; + case 25: + ffor_25bit_32ow(in, out, a_base_p); + return; + case 26: + ffor_26bit_32ow(in, out, a_base_p); + return; + case 27: + ffor_27bit_32ow(in, out, a_base_p); + return; + case 28: + ffor_28bit_32ow(in, out, a_base_p); + return; + case 29: + ffor_29bit_32ow(in, out, a_base_p); + return; + case 30: + ffor_30bit_32ow(in, out, a_base_p); + return; + case 31: + ffor_31bit_32ow(in, out, a_base_p); + return; + case 32: + ffor_32bit_32ow(in, out, a_base_p); + return; + } +} +void ffor(const uint64_t* __restrict in, uint64_t* __restrict out, uint8_t bw, const uint64_t* __restrict a_base_p) { + switch (bw) { + 
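// ---------------------------------------------------------------------------
// Editorial usage sketch (illustrative assumptions, not code from this patch):
// each ffor() overload here is a jump table over the bit width `bw`, selecting
// the unrolled kernel for that width. A caller might pack one 1024-value block
// of uint64_t as follows; deriving `base` and `bw` from the data is an
// assumption of this sketch, the patch itself only provides ffor().
#include <algorithm>
#include <cstdint>
#include <vector>
void ffor(const uint64_t* in, uint64_t* out, uint8_t bw, const uint64_t* base); // added by this patch
void pack_block_example(const uint64_t* values) { // `values` holds 1024 elements
	const uint64_t base = *std::min_element(values, values + 1024);
	uint64_t max_delta = 0;
	for (int k = 0; k < 1024; ++k) max_delta = std::max(max_delta, values[k] - base);
	uint8_t bw = 0;
	while (bw < 64 && (max_delta >> bw) != 0) ++bw; // bits needed for the widest delta
	std::vector<uint64_t> packed(1024);             // worst case bw = 64: 16 * 64 words
	ffor(values, packed.data(), bw, &base);         // dispatches to ffor_<bw>bit_64ow
}
// ---------------------------------------------------------------------------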
case 0: + ffor_0bit_64ow(in, out, a_base_p); + return; + case 1: + ffor_1bit_64ow(in, out, a_base_p); + return; + case 2: + ffor_2bit_64ow(in, out, a_base_p); + return; + case 3: + ffor_3bit_64ow(in, out, a_base_p); + return; + case 4: + ffor_4bit_64ow(in, out, a_base_p); + return; + case 5: + ffor_5bit_64ow(in, out, a_base_p); + return; + case 6: + ffor_6bit_64ow(in, out, a_base_p); + return; + case 7: + ffor_7bit_64ow(in, out, a_base_p); + return; + case 8: + ffor_8bit_64ow(in, out, a_base_p); + return; + case 9: + ffor_9bit_64ow(in, out, a_base_p); + return; + case 10: + ffor_10bit_64ow(in, out, a_base_p); + return; + case 11: + ffor_11bit_64ow(in, out, a_base_p); + return; + case 12: + ffor_12bit_64ow(in, out, a_base_p); + return; + case 13: + ffor_13bit_64ow(in, out, a_base_p); + return; + case 14: + ffor_14bit_64ow(in, out, a_base_p); + return; + case 15: + ffor_15bit_64ow(in, out, a_base_p); + return; + case 16: + ffor_16bit_64ow(in, out, a_base_p); + return; + case 17: + ffor_17bit_64ow(in, out, a_base_p); + return; + case 18: + ffor_18bit_64ow(in, out, a_base_p); + return; + case 19: + ffor_19bit_64ow(in, out, a_base_p); + return; + case 20: + ffor_20bit_64ow(in, out, a_base_p); + return; + case 21: + ffor_21bit_64ow(in, out, a_base_p); + return; + case 22: + ffor_22bit_64ow(in, out, a_base_p); + return; + case 23: + ffor_23bit_64ow(in, out, a_base_p); + return; + case 24: + ffor_24bit_64ow(in, out, a_base_p); + return; + case 25: + ffor_25bit_64ow(in, out, a_base_p); + return; + case 26: + ffor_26bit_64ow(in, out, a_base_p); + return; + case 27: + ffor_27bit_64ow(in, out, a_base_p); + return; + case 28: + ffor_28bit_64ow(in, out, a_base_p); + return; + case 29: + ffor_29bit_64ow(in, out, a_base_p); + return; + case 30: + ffor_30bit_64ow(in, out, a_base_p); + return; + case 31: + ffor_31bit_64ow(in, out, a_base_p); + return; + case 32: + ffor_32bit_64ow(in, out, a_base_p); + return; + case 33: + ffor_33bit_64ow(in, out, a_base_p); + return; + case 34: + ffor_34bit_64ow(in, out, a_base_p); + return; + case 35: + ffor_35bit_64ow(in, out, a_base_p); + return; + case 36: + ffor_36bit_64ow(in, out, a_base_p); + return; + case 37: + ffor_37bit_64ow(in, out, a_base_p); + return; + case 38: + ffor_38bit_64ow(in, out, a_base_p); + return; + case 39: + ffor_39bit_64ow(in, out, a_base_p); + return; + case 40: + ffor_40bit_64ow(in, out, a_base_p); + return; + case 41: + ffor_41bit_64ow(in, out, a_base_p); + return; + case 42: + ffor_42bit_64ow(in, out, a_base_p); + return; + case 43: + ffor_43bit_64ow(in, out, a_base_p); + return; + case 44: + ffor_44bit_64ow(in, out, a_base_p); + return; + case 45: + ffor_45bit_64ow(in, out, a_base_p); + return; + case 46: + ffor_46bit_64ow(in, out, a_base_p); + return; + case 47: + ffor_47bit_64ow(in, out, a_base_p); + return; + case 48: + ffor_48bit_64ow(in, out, a_base_p); + return; + case 49: + ffor_49bit_64ow(in, out, a_base_p); + return; + case 50: + ffor_50bit_64ow(in, out, a_base_p); + return; + case 51: + ffor_51bit_64ow(in, out, a_base_p); + return; + case 52: + ffor_52bit_64ow(in, out, a_base_p); + return; + case 53: + ffor_53bit_64ow(in, out, a_base_p); + return; + case 54: + ffor_54bit_64ow(in, out, a_base_p); + return; + case 55: + ffor_55bit_64ow(in, out, a_base_p); + return; + case 56: + ffor_56bit_64ow(in, out, a_base_p); + return; + case 57: + ffor_57bit_64ow(in, out, a_base_p); + return; + case 58: + ffor_58bit_64ow(in, out, a_base_p); + return; + case 59: + ffor_59bit_64ow(in, out, a_base_p); + return; + case 60: + ffor_60bit_64ow(in, 
out, a_base_p);
+		return;
+	case 61:
+		ffor_61bit_64ow(in, out, a_base_p);
+		return;
+	case 62:
+		ffor_62bit_64ow(in, out, a_base_p);
+		return;
+	case 63:
+		ffor_63bit_64ow(in, out, a_base_p);
+		return;
+	case 64:
+		ffor_64bit_64ow(in, out, a_base_p);
+		return;
+	}
+}
+}}}} // namespace fastlanes::generated::ffor::fallback::scalar
diff --git a/src/fastlanes_generated_unffor.cpp b/src/fastlanes_generated_unffor.cpp
new file mode 100644
index 0000000..6e35d01
--- /dev/null
+++ b/src/fastlanes_generated_unffor.cpp
@@ -0,0 +1,23212 @@
+#include "fastlanes/unffor.hpp"
+
+namespace fastlanes { namespace generated { namespace unffor::fallback { namespace scalar {
+static void unffor_0bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p,
+                                    uint8_t* __restrict a_out_p,
+                                    const uint8_t* __restrict a_base_p) {
+	[[maybe_unused]] auto out = reinterpret_cast<uint8_t*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint8_t*>(a_in_p);
+	[[maybe_unused]] uint8_t register_0;
+	[[maybe_unused]] uint8_t tmp_0;
+	[[maybe_unused]] uint8_t base_0 = *(a_base_p);
+	for (int i = 0; i < 128; ++i) {
+		*(out + (i * 1) + (0 * 128) + (128 * 0)) = base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 1)) = base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 2)) = base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 3)) = base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 4)) = base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 5)) = base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 6)) = base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 7)) = base_0;
+	}
+}
+static void unffor_1bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p,
+                                    uint8_t* __restrict a_out_p,
+                                    const uint8_t* __restrict a_base_p) {
+	[[maybe_unused]] auto out = reinterpret_cast<uint8_t*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint8_t*>(a_in_p);
+	[[maybe_unused]] uint8_t register_0;
+	[[maybe_unused]] uint8_t tmp_0;
+	[[maybe_unused]] uint8_t base_0 = *(a_base_p);
+	for (int i = 0; i < 128; ++i) {
+		register_0 = *(in + (0 * 128) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 0)) = tmp_0;
+		tmp_0 = (register_0 >> 1) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 1)) = tmp_0;
+		tmp_0 = (register_0 >> 2) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 2)) = tmp_0;
+		tmp_0 = (register_0 >> 3) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 3)) = tmp_0;
+		tmp_0 = (register_0 >> 4) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 4)) = tmp_0;
+		tmp_0 = (register_0 >> 5) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 5)) = tmp_0;
+		tmp_0 = (register_0 >> 6) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 6)) = tmp_0;
+		tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 7)) = tmp_0;
+	}
+}
+static void unffor_2bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p,
+                                    uint8_t* __restrict a_out_p,
+                                    const uint8_t* __restrict a_base_p) {
+	[[maybe_unused]] auto out = reinterpret_cast<uint8_t*>(a_out_p);
+	[[maybe_unused]] const auto in = reinterpret_cast<const uint8_t*>(a_in_p);
+	[[maybe_unused]] uint8_t register_0;
+	[[maybe_unused]] uint8_t tmp_0;
+	[[maybe_unused]] uint8_t base_0 = *(a_base_p);
+	for (int i = 0; i < 128; ++i) {
+		register_0 = *(in + (0 * 128) + (i * 1) + 0);
+		tmp_0 = (register_0) & ((1ULL << 2) - 1);
+		tmp_0 += base_0;
+		*(out + (i * 1) + (0 * 128) + (128 * 0)) = tmp_0;
+		tmp_0 = (register_0 >> 2) & ((1ULL
<< 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 2)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 3)) = tmp_0; + register_0 = *(in + (0 * 128) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 4)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 6)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 7)) = tmp_0; + } +} +static void unffor_3bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p, + uint8_t* __restrict a_out_p, + const uint8_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint8_t register_0; + [[maybe_unused]] uint8_t tmp_0; + [[maybe_unused]] uint8_t base_0 = *(a_base_p); + for (int i = 0; i < 128; ++i) { + register_0 = *(in + (0 * 128) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 0)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 1)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 2)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 4)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 6)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 7)) = tmp_0; + } +} +static void unffor_4bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p, + uint8_t* __restrict a_out_p, + const uint8_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint8_t register_0; + [[maybe_unused]] uint8_t tmp_0; + [[maybe_unused]] uint8_t base_0 = *(a_base_p); + for (int i = 0; i < 128; ++i) { + register_0 = *(in + (0 * 128) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 0)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 1)) = tmp_0; + register_0 = *(in + (0 * 128) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 2)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 3)) = tmp_0; + register_0 = *(in + (0 * 128) + (i * 1) + 256); + tmp_0 = (register_0) & ((1ULL << 
4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 4)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 5)) = tmp_0; + register_0 = *(in + (0 * 128) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 6)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 7)) = tmp_0; + } +} +static void unffor_5bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p, + uint8_t* __restrict a_out_p, + const uint8_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint8_t register_0; + [[maybe_unused]] uint8_t tmp_0; + [[maybe_unused]] uint8_t base_0 = *(a_base_p); + for (int i = 0; i < 128; ++i) { + register_0 = *(in + (0 * 128) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 0)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 1)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 2)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 4)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 5)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 6)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 7)) = tmp_0; + } +} +static void unffor_6bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p, + uint8_t* __restrict a_out_p, + const uint8_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint8_t register_0; + [[maybe_unused]] uint8_t tmp_0; + [[maybe_unused]] uint8_t base_0 = *(a_base_p); + for (int i = 0; i < 128; ++i) { + register_0 = *(in + (0 * 128) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 0)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 2)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 3)) = tmp_0; + register_0 = *(in + (0 * 128) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 
6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 4)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 6)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 7)) = tmp_0; + } +} +static void unffor_7bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p, + uint8_t* __restrict a_out_p, + const uint8_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint8_t register_0; + [[maybe_unused]] uint8_t tmp_0; + [[maybe_unused]] uint8_t base_0 = *(a_base_p); + for (int i = 0; i < 128; ++i) { + register_0 = *(in + (0 * 128) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 0)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 1)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 2)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 4)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 128) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 6)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 7)) = tmp_0; + } +} +static void unffor_8bw_8ow_8crw_1uf(const uint8_t* __restrict a_in_p, + uint8_t* __restrict a_out_p, + const uint8_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint8_t register_0; + [[maybe_unused]] uint8_t tmp_0; + [[maybe_unused]] uint8_t base_0 = *(a_base_p); + for (int i = 0; i < 128; ++i) { + register_0 = *(in + (0 * 128) + (i * 1) + 0); + register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 0)) = register_0; + register_0 = *(in + (0 * 128) + (i * 1) + 128); + register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 1)) = register_0; + register_0 = *(in + (0 * 128) + (i * 1) + 256); + register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 2)) = register_0; + register_0 = *(in + (0 * 128) + (i * 1) + 384); + 
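// ---------------------------------------------------------------------------
// Editorial sketch (illustrative): the unffor_<bw>bw_* kernels above invert
// the packing. Each one reads the packed words, extracts bw-bit fields
// (re-joining fields that were split across a word boundary), and adds the
// frame-of-reference base back on. A compact single-lane equivalent, with
// hypothetical naming:
#include <cstddef>
#include <cstdint>
static void unffor_unpack_one_lane(const uint64_t* in, uint64_t* out, size_t n, unsigned bw, uint64_t base) {
	if (bw == 0) { // width 0: every value equals the base
		for (size_t k = 0; k < n; ++k) out[k] = base;
		return;
	}
	const uint64_t mask = (bw == 64) ? ~0ULL : ((1ULL << bw) - 1);
	size_t w = 0;     // current packed word
	unsigned off = 0; // bit offset inside in[w]
	for (size_t k = 0; k < n; ++k) {
		uint64_t v = in[w] >> off;                       // low part of the field
		if (off + bw > 64) v |= in[w + 1] << (64 - off); // high part from the next word
		out[k] = (v & mask) + base;                      // truncate to bw bits, undo the FOR delta
		off += bw;
		if (off >= 64) { off -= 64; ++w; }
	}
}
// Paired with the packing sketch earlier: unpacking what was packed reproduces
// the input exactly whenever every delta fits in bw bits, which is what makes
// FFOR a lossless encoding.
// ---------------------------------------------------------------------------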
register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 3)) = register_0; + register_0 = *(in + (0 * 128) + (i * 1) + 512); + register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 4)) = register_0; + register_0 = *(in + (0 * 128) + (i * 1) + 640); + register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 5)) = register_0; + register_0 = *(in + (0 * 128) + (i * 1) + 768); + register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 6)) = register_0; + register_0 = *(in + (0 * 128) + (i * 1) + 896); + register_0 += base_0; + *(out + (i * 1) + (0 * 128) + (128 * 7)) = register_0; + } +} +static void unffor_0bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + *(out + (i * 1) + (0 * 64) + (64 * 0)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = base_0; + } +} +static void unffor_1bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 1) - 
1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_2bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_3bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i 
* 1) + 0); + tmp_0 = (register_0) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_4bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 
7)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_5bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 
1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_6bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_7bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) 
+ (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_8bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 128); + 
tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0;
+ register_0 = *(in + (0 * 64) + (i * 1) + 192);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0;
+ register_0 = *(in + (0 * 64) + (i * 1) + 256);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0;
+ register_0 = *(in + (0 * 64) + (i * 1) + 320);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0;
+ register_0 = *(in + (0 * 64) + (i * 1) + 384);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0;
+ register_0 = *(in + (0 * 64) + (i * 1) + 448);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0;
+ }
+}
+static void unffor_9bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p,
+ uint16_t* __restrict a_out_p,
+ const uint16_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint16_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint16_t*>(a_in_p);
+ [[maybe_unused]] uint16_t register_0;
+ [[maybe_unused]] uint16_t tmp_0;
+ [[maybe_unused]] uint16_t base_0 = *(a_base_p);
+ for (int i = 0; i < 64; ++i) {
+ register_0 = *(in + (0 * 64) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1);
+ register_0 = *(in + (0 * 64) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 2) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1);
+ register_0 = *(in + (0 * 64) + (i * 1) + 128);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1);
+ register_0 = *(in + (0 * 64) + (i * 1) + 192);
+ tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1);
+ register_0 = *(in + (0 * 64) + (i * 1) + 256);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ register_0 = *(in + (0 * 64) + (i * 1) + 320);
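+ // Value 8 of each lane straddles a 16-bit word boundary: its low eight bits
+ // sit in bits 8..15 of the word at offset 256 and its top bit in bit 0 of
+ // the word at offset 320, so the freshly loaded word contributes one masked
+ // bit, OR-ed in at position 8, before the FOR base is added back.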
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_10bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 
1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_11bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 512); + tmp_0 |= ((register_0) & 
((1ULL << 4) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_12bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 576); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = 
(register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_13bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + 
(0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_14bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + register_0 = *(in + (0 * 64) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 704); + 
tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_15bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + register_0 = *(in + (0 
* 64) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 64) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = tmp_0; + } +} +static void unffor_16bw_16ow_16crw_1uf(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + const uint16_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint16_t register_0; + [[maybe_unused]] uint16_t tmp_0; + [[maybe_unused]] uint16_t base_0 = *(a_base_p); + for (int i = 0; i < 64; ++i) { + register_0 = *(in + (0 * 64) + (i * 1) + 0); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 0)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 64); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 1)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 128); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 2)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 192); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 3)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 256); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 4)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 320); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 5)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 384); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 6)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 448); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 7)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 512); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 8)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 576); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 9)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 640); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 10)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 704); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 11)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 768); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 12)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 832); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 13)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 896); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 14)) = register_0; + register_0 = *(in + (0 * 64) + (i * 1) + 960); + register_0 += base_0; + *(out + (i * 1) + (0 * 64) + (64 * 15)) = register_0; + } +} +static void unffor_0bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* 
__restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p);
+ [[maybe_unused]] uint32_t register_0;
+ [[maybe_unused]] uint32_t tmp_0;
+ [[maybe_unused]] uint32_t base_0 = *(a_base_p);
+ for (int i = 0; i < 32; ++i) {
+ *(out + (i * 1) + (0 * 32) + (32 * 0)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 1)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 2)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 3)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 4)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 5)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 6)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 7)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 8)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 9)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 10)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 11)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 12)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 13)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 14)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 15)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 16)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 17)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 18)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 19)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 20)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 21)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 22)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 23)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 24)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 25)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 26)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 27)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 28)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 29)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 30)) = base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 31)) = base_0;
+ }
+}
+static void unffor_1bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+ uint32_t* __restrict a_out_p,
+ const uint32_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p);
+ [[maybe_unused]] uint32_t register_0;
+ [[maybe_unused]] uint32_t tmp_0;
+ [[maybe_unused]] uint32_t base_0 = *(a_base_p);
+ for (int i = 0; i < 32; ++i) {
+ register_0 = *(in + (0 * 32) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 2) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 3) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 5) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 1) - 1);
+ tmp_0 += base_0;
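+ // unffor = bit-unpack + undo frame-of-reference: each 1-bit value is masked
+ // out of the packed word, the block base (*a_base_p) is added back, and the
+ // result lands in its interleaved slot out[i + 32 * value_index].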
+ *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_2bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL 
<< 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_3bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + 
uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 
24)) = tmp_0;
+ tmp_0 = (register_0 >> 11) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0;
+ tmp_0 = (register_0 >> 23) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0;
+ tmp_0 = (register_0 >> 26) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0;
+ }
+}
+static void unffor_4bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p,
+ uint32_t* __restrict a_out_p,
+ const uint32_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p);
+ [[maybe_unused]] uint32_t register_0;
+ [[maybe_unused]] uint32_t tmp_0;
+ [[maybe_unused]] uint32_t base_0 = *(a_base_p);
+ for (int i = 0; i < 32; ++i) {
+ register_0 = *(in + (0 * 32) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0;
+ register_0 = *(in + (0 * 32) + (i * 1) + 32);
+ tmp_0 = (register_0) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0;
+ register_0 = *(in + (0 * 32) + (i * 1) + 64);
+ tmp_0 = (register_0) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0;
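+ // Since 4 divides 32, every 4-bit value lies inside a single packed word:
+ // eight values per word, one shift and one mask each, with no stitching
+ // across word boundaries. As a sketch (not code from this file), the
+ // generator's rule for width W, value k, lane i is:
+ //   word = (k * W) / 32; shift = (k * W) % 32;
+ //   v = (in[32 * word + i] >> shift) & ((1u << W) - 1);
+ // with a second, masked OR from word + 1 whenever shift + W > 32.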
+ tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_5bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out 
+ (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_6bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 
1)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) 
= tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_7bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2;
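+ // tmp_0 now holds output 18 of this 7-bit-wide block, reassembled from two
+ // packed words: its low 2 bits came from bits 30..31 of the previous word,
+ // its high 5 bits from the word just loaded. All of these unrolled kernels
+ // follow the same shape: each of the 32 lanes packs its values contiguously
+ // across 32-bit words interleaved at stride 32, each value is extracted by
+ // shift-and-mask (stitched together when it straddles a word boundary), and
+ // the frame-of-reference base *a_base_p is added back before the store.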
+ tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_8bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 
* 9)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_9bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + 
(32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) 
+ (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_10bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 22) & 
((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_11bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL 
<< 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 
9) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_12bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) 
+ 192); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_13bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & 
((1ULL << 1) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + 
tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_14bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = 
tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_15bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = 
(register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 25) 
& ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_16bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + 
(0 * 32) + (32 * 11)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_17bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) 
+ (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = 
tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_18bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; +
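+ // Word-boundary case, spelled out once since the same shape repeats below:
+ // value 8 of an 18-bit lane starts at bit 8 * 18 = 144 = 4 * 32 + 16, so its
+ // low 16 bits come from packed word 4 (the ">> 16") and its top 2 bits come
+ // from word 5 (offset 160), masked with ((1ULL << 2) - 1) and merged in with
+ // "<< 16" before the base is added back.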
tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i 
* 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_19bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) =
tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_20bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; +
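+ // base_0 below is the frame-of-reference (FOR) base shared by the whole
+ // vector: the packed words store only (value - base_0) in 20 bits, and each
+ // decoded value re-adds it. Illustrative numbers (not from the source): with
+ // base_0 = 1000 and a packed residual of 7, the kernel writes 1007.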
[[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; 
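+ // Addressing scheme used by all of these kernels: the 1024-value block is
+ // split into 32 interleaved lanes. Lane i keeps its w-th packed word at
+ // in[i + 32 * w] (the "(i * 1)" term plus a multiple-of-32 offset), and its
+ // j-th decoded value lands at out[i + 32 * j] (the "(32 * j)" terms), so for
+ // a fixed j the 32 lanes fill 32 consecutive output slots.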
+ tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_21bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0
= (register_0 >> 9) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + 
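+ // General rule instantiated around this point: value j of a 21-bit lane
+ // starts at bit j * 21, i.e. word w = (j * 21) / 32, bit p = (j * 21) % 32.
+ // For j = 24: p = 24, so the ">> 24" above takes the 8 low bits from word 15
+ // and the merge below pulls the remaining 13 bits from word 16 (offset 512).
+ // The 1ULL literal keeps the mask computation in 64-bit arithmetic, which
+ // stays well-defined for every width in this kernel family up to 32 bits.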
register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_22bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0
* 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + 
(0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_23bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |=
((register_0) & ((1ULL << 11) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 14; + tmp_0 += 
base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_24bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) +
384); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_25bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i *
1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + 
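+ // Size check for the 25-bit layout: each lane packs 32 values into
+ // 32 * 25 = 800 bits = 25 words, so the interleaved loads step through
+ // offsets 0, 32, ..., 768 (25 loads per lane) and the full block reads
+ // 32 * 25 = 800 input words to produce 1024 output values.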
register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_26bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) +
(32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= 
((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_27bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL <<
20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) 
& ((1ULL << 18) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_28bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; +
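// The unffor_{B}bw_32ow_32crw_1uf kernels in this block are machine-generated
+ // "unFFOR" routines: they invert FFOR (fused frame-of-reference + bit-packing).
+ // Each one walks 32 interleaved lanes (i), extracts consecutive B-bit fields from
+ // the packed words, adds the block's base back, and stores value k of lane i at
+ // out[32 * k + i]. For orientation only -- a rough scalar sketch with assumed
+ // names B (packed bit width) and base, not a helper that exists in this file:
+ //
+ //   for (int lane = 0; lane < 32; ++lane) {
+ //     for (int k = 0; k < 32; ++k) {
+ //       uint32_t bit = k * B;                      // bit offset in this lane's stream
+ //       uint32_t v   = in[32 * (bit / 32) + lane] >> (bit % 32);
+ //       if ((bit % 32) + B > 32) {                 // field spills into the next word
+ //         v |= in[32 * (bit / 32 + 1) + lane] << (32 - bit % 32);
+ //       }
+ //       out[32 * k + lane] = (v & ((1ULL << B) - 1)) + base;
+ //     }
+ //   }
+ //
+ // The generated kernels unroll the inner loop and keep the current input word in
+ // register_0, which is why each function body is one long straight-line sequence. +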
*(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 
1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_29bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 15; + tmp_0
+= base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 
27) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_30bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1)
+ 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 
>> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_31bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 *
32) + (32 * 4)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 5)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 20; + tmp_0 += base_0; + *(out + 
(i * 1) + (0 * 32) + (32 * 20)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 32) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = tmp_0; + } +} +static void unffor_32bw_32ow_32crw_1uf(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + const uint32_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint32_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint32_t*>(a_in_p); + [[maybe_unused]] uint32_t register_0; + [[maybe_unused]] uint32_t tmp_0; + [[maybe_unused]] uint32_t base_0 = *(a_base_p); + for (int i = 0; i < 32; ++i) { + register_0 = *(in + (0 * 32) + (i * 1) + 0); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 0)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 32); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 1)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 64); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 2)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 96); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 3)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 128); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 4)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 160); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) +
(32 * 5)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 192); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 6)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 224); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 7)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 256); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 8)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 288); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 9)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 320); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 10)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 352); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 11)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 384); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 12)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 416); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 13)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 448); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 14)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 480); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 15)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 512); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 16)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 544); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 17)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 576); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 18)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 608); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 19)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 640); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 20)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 672); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 21)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 704); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 22)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 736); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 23)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 768); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 24)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 800); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 25)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 832); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 26)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 864); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 27)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 896); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 28)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 928); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 29)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 960); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 30)) = register_0; + register_0 = *(in + (0 * 32) + (i * 1) + 992); + register_0 += base_0; + *(out + (i * 1) + (0 * 32) + (32 * 31)) = register_0; + } +} +static void unffor_0bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto 
out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + *(out + (i * 1) + (0 * 16) + (16 * 0)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = base_0; + *(out + (i * 1) + (0 * 16) + (16 *
63)) = base_0; + } +} +static void unffor_1bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1)
+ (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 
1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_2bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 =
(register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 2) - 
1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_3bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 =
(register_0 >> 24) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 
36)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_4bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = 
reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0;
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 = (register_0) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0;
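+ // Each 64-bit input word packs sixteen 4-bit values: extract with shift/mask, re-add the FOR base, store at a 16-value lane stride.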
+ tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 4) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = 
tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_5bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 5) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 15) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 25) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 35) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 45) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 50) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 55) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 4;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0;
+ tmp_0 = (register_0 >> 11) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 *
16)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 5) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 
* 44)) = tmp_0;
+ tmp_0 = (register_0 >> 33) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0;
+ tmp_0 = (register_0 >> 38) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0;
+ tmp_0 = (register_0 >> 43) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0;
+ tmp_0 = (register_0 >> 53) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0;
+ tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 1;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0;
+ tmp_0 = (register_0 >> 9) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0;
+ tmp_0 = (register_0 >> 19) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 39) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 49) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_6bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 *
16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 6) - 1); + 
tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 6) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 6) - 1); 
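+ // Same epilogue as every extraction above: re-apply the frame-of-reference base, then scatter to the interleaved output.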
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_7bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 21) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 35) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 42) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 49) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 1;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 13) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0;
+ tmp_0 = (register_0 >> 27) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0;
+ tmp_0 = (register_0 >> 41) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0;
+ tmp_0 = (register_0 >> 55) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0;
+ tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 2;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0;
+ tmp_0 = (register_0 >> 5) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0;
+ tmp_0 = (register_0 >> 19) & ((1ULL << 7) - 1);
+ tmp_0 +=
base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 7) - 1); + tmp_0 += base_0; + *(out + 
(i * 1) + (0 * 16) + (16 * 48)) = tmp_0;
+ tmp_0 = (register_0 >> 23) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0;
+ tmp_0 = (register_0 >> 51) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 6;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0;
+ tmp_0 = (register_0 >> 15) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0;
+ tmp_0 = (register_0 >> 22) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 43) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 50) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_8bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 8) &
((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 
* 37)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0;
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0;
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0;
+ register_0 = *(in + (0 * 16) + (i * 1) + 112);
+ tmp_0 = (register_0) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_9bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]]
uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + 
(0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 9) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + 
tmp_0 = (register_0 >> 20) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0;
+ tmp_0 = (register_0 >> 38) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0;
+ tmp_0 = (register_0 >> 47) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 128);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 8;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 19) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_10bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0;
+
tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 
* 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_11bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + 
uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); +
tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = 
(register_0 >> 27) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_12bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0
>> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + 
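// Hedged illustration (not generated output of this patch): whenever a
// packed value straddles a 64-bit word boundary, the generated code emits
// the two-step stitch seen in these kernels: take the low bits from the
// current word, load the next word, then OR in the high bits. For a 12-bit
// value that starts at bit 60 of word w0 and ends in word w1 this is
// equivalent to:
//
//   uint64_t v = (w0 >> 60) & 0xFULL;   // low 4 bits from the first word
//   v |= (w1 & 0xFFULL) << 4;           // high 8 bits from the next word
//   // v now holds the full 12-bit payload; base_0 is added afterwards.
//
// The shift and mask constants in every kernel follow directly from
// (k * bit_width) mod 64 for the k-th value.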
tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 
>> 16) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_13bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 55) &
((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_14bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i *
1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & 
((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) 
= tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_15bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0
>> 37) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 15) - 1); + tmp_0 += base_0; + 
*(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = 
tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_16bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + register_0 =
*(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + 
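// Hedged illustration (not generated output of this patch): with a 16-bit
// width the packing is exact, so no value straddles a word boundary; each
// 64-bit input word holds four values at shifts 0, 16, 32 and 48. Per lane,
// 64 output values therefore consume 64 * 16 / 64 = 16 input words, and
// because the 16 lanes are interleaved those words sit at offsets
// 0, 16, 32, ..., 240 in the loads above and below:
//
//   // word w of lane i lives at in[i + 16 * w]   (w = 0..15 for 16 bits)
//
// The same arithmetic gives the highest load offset of every kernel in
// this file: for bit width b it is 16 * (b - 1).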
tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_17bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 38) &
((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i 
* 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 22) 
& ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_18bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out +
(i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 10; + tmp_0 += base_0; + *(out + (i 
* 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 
= (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_19bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 16; + tmp_0 += base_0; + *(out
+ (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) 
& ((1ULL << 11) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_20bw_64ow_64crw_1uf(const uint64_t* 
__restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; +
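+ // Editorial note (a sketch, not generated code): 20 does not divide 64, so
+ // some packed values straddle a word boundary. Whenever only LO low bits of a
+ // value remain in the current word, the generator emits this two-word pattern:
+ //   tmp_0  = (register_0 >> (64 - LO)) & ((1ULL << LO) - 1);   // low bits
+ //   register_0 = *(in + /* next input word */);
+ //   tmp_0 |= (register_0 & ((1ULL << (20 - LO)) - 1)) << LO;   // high bits
+ //   /* store */ // out value = tmp_0 + base_0;
+ // For this bit width LO takes the values 4, 8, 12 and 16.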
*(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 
* 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_21bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 21) - 1); + tmp_0 += base_0;
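+ // Editorial note (hypothetical usage, an assumption rather than code from this
+ // patch): these per-bit-width kernels are presumably selected by a dispatcher
+ // keyed on the vector's bit width, along the lines of:
+ //   switch (bw) {
+ //     case 20: unffor_20bw_64ow_64crw_1uf(in, out, base); break;
+ //     case 21: unffor_21bw_64ow_64crw_1uf(in, out, base); break;
+ //     // ... one case per supported bit width
+ //   }
+ // No such dispatch function appears in this hunk.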
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 21) - 
1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 
|= ((register_0) & ((1ULL << 5) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_22bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) -
1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = 
(register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 
34) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_23bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) 
+ (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + 
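+ // The unrolled statements in these generated kernels all instantiate one scalar recurrence. As an illustrative sketch only (bw, n, w and off are names introduced here, not identifiers from the generated code), value n of lane i starts at bit n * bw of that lane's packed stream, whose k-th word sits at in[i + 16 * k], so for bw < 64:
+ //   uint64_t start = uint64_t(n) * bw, w = start / 64, off = start % 64;
+ //   uint64_t v = in[i + 16 * w] >> off;
+ //   if (off + bw > 64) { v |= in[i + 16 * (w + 1)] << (64 - off); }
+ //   out[i + 16 * n] = (v & ((1ULL << bw) - 1)) + base;
+ // The next statements are exactly this recurrence for bw = 23, n = 30: bits 50..63 of word 10 were masked above, and word 11 (offset 176) supplies the 9 remaining high bits.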
register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL 
<< 3) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_24bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 24) & 
((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + 
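+ // 24-bit fields repeat exactly every three 64-bit words (8 values = 192 bits), so value 32 begins word-aligned in word 12 (offset 192) and carries nothing over from word 11.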
*(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + register_0 = *(in + (0 * 16) 
+ (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_25bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + 
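+ // value 12 of the 25-bit kernel straddles words 4 and 5: its low 20 bits were shifted down from bit 44 above, and the load of word 5 (offset 80) ORs in the 5 remaining high bits.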
register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 18) & 
((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= 
((register_0) & ((1ULL << 17) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_26bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + 
(0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= 
((register_0) & ((1ULL << 14) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_27bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 23; + tmp_0 += base_0; 
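+ // value 11 of the 27-bit kernel is now complete (23 bits from bit 41 of word 4, 4 bits from word 5) and the frame-of-reference base has been added; the store below writes it to lane i's slot 11.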
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 5; 
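+ // the 22 high bits of value 33 come from word 14 (offset 224); its 5 low bits were taken from bit 59 of word 13 above.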
+ tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 13) 
& ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_28bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = 
tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 
32)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 
1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_29bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; +
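+ // Addressing pattern (comment added for readability): the lane index i
+ // selects one of 16 interleaved lanes, so consecutive packed words of a lane
+ // sit 16 uint64_t apart in `in` (offsets 0, 16, 32, ...), and value v of
+ // lane i lands at out[16 * v + i].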
tmp_0 = (register_0 >> 34) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; 
+ tmp_0 = (register_0 >> 3) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); 
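+ // The unrolled statements in every unffor_<bw>bw kernel instantiate one
+ // generic pattern: shift/mask extracts the next bw-bit value, values that
+ // straddle a 64-bit word boundary OR in the low bits of the next word, and
+ // the frame-of-reference base (base_0) is added back before the store.
+ // A compact scalar sketch of the same computation (an illustration assuming
+ // the interleaved layout visible above; `bw` stands in for the kernel's bit
+ // width and is not a name used by the generated code):
+ //
+ //   for (int lane = 0; lane < 16; ++lane) {
+ //     for (int v = 0; v < 64; ++v) {
+ //       int bit = v * bw, word = bit >> 6, off = bit & 63;
+ //       uint64_t val = in[16 * word + lane] >> off;
+ //       if (off + bw > 64) { val |= in[16 * (word + 1) + lane] << (64 - off); }
+ //       out[16 * v + lane] = (val & ((1ULL << bw) - 1)) + base_0;
+ //     }
+ //   }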
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_30bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out +
(i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + 
(i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = 
(register_0 >> 28) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_31bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 *
4)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 
25)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 
46)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_32bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; +
*(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + 
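+ // 32 is the one width in this range that divides 64 evenly: each input word
+ // holds exactly two values and none straddles a word boundary, so the
+ // cross-word OR step present in the other kernels disappears here.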
tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += 
base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 = (register_0) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_33bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 25; + tmp_0 += base_0; +
*(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 33) - 1); + tmp_0 += 
base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 16; + tmp_0 += base_0; + *(out 
+ (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_34bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1);
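+ // For widths above 32 bits nearly every value crosses a word boundary: the
+ // 26 high bits extracted above are completed with the 8 low bits of the next
+ // input word (26 + 8 = 34) before the base is added.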
+ register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + 
*(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 = (register_0) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = 
(register_0 >> 24) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_35bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const 
uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 29;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 23;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 17;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 18) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 11;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 5;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 34;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 1) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 112);
+ tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 28;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0;
+ tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 128);
+ tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 22;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0;
+ tmp_0 = (register_0 >> 13) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 144);
+ tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 16;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0;
+ tmp_0 = (register_0 >> 19) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 160);
+ tmp_0 |=
((register_0) & ((1ULL << 25) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & 
((1ULL << 21) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 35) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) 
- 1)) << 18;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 528);
+ tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 12;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 23) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 544);
+ tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 6;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_36bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 112);
+ tmp_0 |= ((register_0) & ((1ULL << 20) -
1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); 
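+ // Because 16 * 36 = 9 * 64, the 36-bit pattern repeats every 16 values:
+ // value 32 above starts flush at a fresh input word, and value 33 again
+ // spans two words, so the load below supplies its top 8 bits.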
+ register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 36) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= 
((register_0) & ((1ULL << 24) - 1)) << 12;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0;
+ tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 496);
+ tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 4;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0;
+ tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 512);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 32;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 528);
+ tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 24;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 12) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 544);
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 16;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 560);
+ tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 8;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_37bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 27;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 10) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 17;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 7;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 34;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+
tmp_0 = (register_0 >> 3) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 
>> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 37) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 26; + 
tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0;
+ tmp_0 = (register_0 >> 11) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0;
+ tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 448);
+ tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 16;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0;
+ tmp_0 = (register_0 >> 21) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0;
+ tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 464);
+ tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 6;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0;
+ tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 480);
+ tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 33;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0;
+ tmp_0 = (register_0 >> 4) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0;
+ tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 496);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 23;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0;
+ tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 512);
+ tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 13;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0;
+ tmp_0 = (register_0 >> 24) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0;
+ tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 528);
+ tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 3;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0;
+ tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 544);
+ tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 30;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0;
+ tmp_0 = (register_0 >> 7) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 560);
+ tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 20;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 576);
+ tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 10;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_38bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL
<< 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = 
*(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 = (register_0) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & 
((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 38) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 38) - 1); + tmp_0 += 
base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0;
+ tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 576);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 24;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 38) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0;
+ tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 592);
+ tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 12;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0;
+ tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0;
+ }
+}
+static void unffor_39bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p,
+ uint64_t* __restrict a_out_p,
+ const uint64_t* __restrict a_base_p) {
+ [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p);
+ [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p);
+ [[maybe_unused]] uint64_t register_0;
+ [[maybe_unused]] uint64_t tmp_0;
+ [[maybe_unused]] uint64_t base_0 = *(a_base_p);
+ for (int i = 0; i < 16; ++i) {
+ register_0 = *(in + (0 * 16) + (i * 1) + 0);
+ tmp_0 = (register_0) & ((1ULL << 39) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0;
+ tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 16);
+ tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 25;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0;
+ tmp_0 = (register_0 >> 14) & ((1ULL << 39) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0;
+ tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 32);
+ tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 11;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0;
+ tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 48);
+ tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 36;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0;
+ tmp_0 = (register_0 >> 3) & ((1ULL << 39) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0;
+ tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 64);
+ tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 22;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0;
+ tmp_0 = (register_0 >> 17) & ((1ULL << 39) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0;
+ tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 80);
+ tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 8;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0;
+ tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 96);
+ tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 33;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0;
+ tmp_0 = (register_0 >> 6) & ((1ULL << 39) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0;
+ tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1);
+ register_0 = *(in + (0 * 16) + (i * 1) + 112);
+ tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 19;
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0;
+ tmp_0 = (register_0 >> 20) & ((1ULL << 39) - 1);
+ tmp_0 += base_0;
+ *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0;
+ tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1);
+
register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + 
(i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = 
tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_40bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + 
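// note: with a 40-bit width the bit alignment repeats every 8 values (8 * 40 == 5 * 64), so each group of 5 input words yields 8 outputs + 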
tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = 
(register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_41bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); +
[[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) 
- 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 
* 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + 
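// note: value 56 is assembled from the 8 carry bits above plus 33 bits of the newly loaded word (8 + 33 == 41) + 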
tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_42bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= 
((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL 
<< 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & 
((1ULL << 32) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_43bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = 
(register_0) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 
= *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 
37)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 
19) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_44bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 
tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 
44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 
20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +}
+// unffor, bit width 45: same unpack-and-add-base pattern as above.
+static void unffor_45bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 *
16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 
22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 
16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) 
& ((1ULL << 24) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +}
+// unffor, bit width 46.
+static void unffor_46bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL
<< 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 
1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 = (register_0) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + 
(0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = 
(register_0 >> 18) & ((1ULL << 46) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +}
+// unffor, bit width 47.
+static void unffor_47bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 47) -
1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 
* 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) 
- 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +}
+// unffor, bit width 48.
+static void unffor_48bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48);
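+// with bw = 48, three input words hold exactly four values, so the pattern realigns here: a fresh word starts value 4 on a clean boundary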
+ tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += 
base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + 
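+// note the interleaved layout throughout these kernels: value v of lane i is stored at out[16 * v + i]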
*(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 
= *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_49bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + /* unpack 16 lanes x 64 values of 49 bits each, then add the frame-of-reference base */ + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); +
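/* a 49-bit value that straddles two packed words takes its low bits from the top of the current word and its high bits, OR'd in next, from the bottom of the following word; base_0 then undoes the frame-of-reference subtraction */ +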
tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 
1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) 
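/* store decoded value 49 of this lane's 64 values */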
= tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_50bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + /* unpack 16 lanes x 64 values of 50 bits each, then add the frame-of-reference base */ + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 =
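/* low 28 bits of a straddling 50-bit value, taken from the top of the current word */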
(register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) 
+ (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 = (register_0) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 
* 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 
48) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_51bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + /* unpack 16 lanes x 64 values of 51 bits each, then add the frame-of-reference base */ + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 =
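/* load the next packed 64-bit word for this lane */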
*(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = 
(register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 
16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 49; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 
>> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_52bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + /* unpack 16 lanes x 64 values of 52 bits each, then add the frame-of-reference base */ + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + *(out
+ (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 
16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) 
+ (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 = (register_0) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_53bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = 
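/* reinterpret the output buffer as 64-bit words */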
reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + /* unpack 16 lanes x 64 values of 53 bits each, then add the frame-of-reference base */ + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 =
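/* low 48 bits from the current word; the remaining 5 of 53 bits are OR'd in from the next word */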
(register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 
* 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 15) 
- 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 49; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 51; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +// unFFOR kernel: unpack 1024 interleaved 54-bit deltas (16 lanes x 64 values) and add the frame-of-reference base. +static void unffor_54bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 2) 
& ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 = (register_0) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 
40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = 
tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +// unFFOR kernel: unpack 1024 interleaved 55-bit deltas (16 lanes x 64 values) and add the frame-of-reference base. +static void unffor_55bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL
<< 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 53; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= 
((register_0) & ((1ULL << 49) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 51; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + 
(0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 49; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 8) & 
((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +// unFFOR kernel: unpack 1024 interleaved 56-bit deltas (16 lanes x 64 values) and add the frame-of-reference base. +static void unffor_56bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i
* 1) + (0 * 16) + (16 * 7)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + 
tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + 
(i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 = (register_0) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL 
<< 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +// unFFOR kernel: unpack 1024 interleaved 57-bit deltas (16 lanes x 64 values) and add the frame-of-reference base. +static void unffor_57bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 49; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 
160); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 55; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + 
register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 53; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & 
((1ULL << 5) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 51; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) 
+ 880); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_58bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1))
<< 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + 
tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 = (register_0) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 
44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + 
tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_59bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 59) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 55; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 59) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 1; +
tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 51; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 59) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = 
(register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 57; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 59) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) 
+ 688); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 53; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 58; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 59) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 49; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 54; + tmp_0 += base_0; + 
*(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_60bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0)
& ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) 
- 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 
1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 = (register_0) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 
63)) = tmp_0; + } +} +static void unffor_61bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 61) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); +
register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 19) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 51; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 57; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 60; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 61) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 
35) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 53; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 59; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 61) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = 
tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 49; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 55; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 3) - 1)) << 58; + tmp_0 += base_0; + *(out + (i * 1) + 
(0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_62bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1))
<< 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= 
((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 = (register_0) & ((1ULL << 62) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 36) & 
((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 28; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 58; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + 
tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 60; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_63bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + tmp_0 = (register_0) & ((1ULL << 63) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = tmp_0; + tmp_0 = (register_0 >> 63) & ((1ULL << 1) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 16); + tmp_0 |= ((register_0) & ((1ULL << 62) - 1)) << 1; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = tmp_0; + tmp_0 = (register_0 >> 62) & ((1ULL << 2) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 32); + tmp_0 |= ((register_0) & ((1ULL << 61) - 1)) << 2; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = tmp_0; + tmp_0 = (register_0 >> 61) & ((1ULL << 3) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 48); + tmp_0 |= ((register_0) & ((1ULL << 60) - 1)) << 3; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = tmp_0; + tmp_0 = (register_0 >> 60) & ((1ULL << 4) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 64); + tmp_0 |= ((register_0) & ((1ULL << 59) - 1)) << 4; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = tmp_0; + tmp_0 = (register_0 >> 59) & ((1ULL << 5) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 80); + tmp_0 |= ((register_0) & ((1ULL << 58) - 1)) << 5; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = tmp_0; + tmp_0 = (register_0 >> 58) & ((1ULL << 6) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 96); + tmp_0 |= ((register_0) & ((1ULL << 57) - 1)) << 6; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = tmp_0; + tmp_0 = (register_0 >> 57) & ((1ULL << 7) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 112); + tmp_0 |= ((register_0) & ((1ULL << 56) - 1)) << 7; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = tmp_0; + tmp_0 = (register_0 >> 56) & ((1ULL << 8) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 128); + tmp_0 |= ((register_0) & ((1ULL << 55) - 1)) << 8; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = tmp_0; + tmp_0 = (register_0 >> 55) & ((1ULL << 9) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 144); + tmp_0 |= ((register_0) & ((1ULL << 54) - 1)) << 9; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = tmp_0; + tmp_0 = (register_0 >> 54) & ((1ULL << 10) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 160); + tmp_0 |= ((register_0) & ((1ULL << 53) - 1)) << 10; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = tmp_0; + tmp_0 = (register_0 >> 53) & ((1ULL << 11) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 176); + tmp_0 |= ((register_0) & ((1ULL << 52) - 1)) << 11; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = tmp_0; + tmp_0 = (register_0 >> 52) & ((1ULL << 12) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 192); + tmp_0 |= ((register_0) & ((1ULL << 51) - 1)) << 12; + tmp_0 += base_0; + *(out + (i * 1)
+ (0 * 16) + (16 * 12)) = tmp_0; + tmp_0 = (register_0 >> 51) & ((1ULL << 13) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 208); + tmp_0 |= ((register_0) & ((1ULL << 50) - 1)) << 13; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = tmp_0; + tmp_0 = (register_0 >> 50) & ((1ULL << 14) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 224); + tmp_0 |= ((register_0) & ((1ULL << 49) - 1)) << 14; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = tmp_0; + tmp_0 = (register_0 >> 49) & ((1ULL << 15) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 240); + tmp_0 |= ((register_0) & ((1ULL << 48) - 1)) << 15; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = tmp_0; + tmp_0 = (register_0 >> 48) & ((1ULL << 16) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 256); + tmp_0 |= ((register_0) & ((1ULL << 47) - 1)) << 16; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = tmp_0; + tmp_0 = (register_0 >> 47) & ((1ULL << 17) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 272); + tmp_0 |= ((register_0) & ((1ULL << 46) - 1)) << 17; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = tmp_0; + tmp_0 = (register_0 >> 46) & ((1ULL << 18) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 288); + tmp_0 |= ((register_0) & ((1ULL << 45) - 1)) << 18; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = tmp_0; + tmp_0 = (register_0 >> 45) & ((1ULL << 19) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 304); + tmp_0 |= ((register_0) & ((1ULL << 44) - 1)) << 19; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = tmp_0; + tmp_0 = (register_0 >> 44) & ((1ULL << 20) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 320); + tmp_0 |= ((register_0) & ((1ULL << 43) - 1)) << 20; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = tmp_0; + tmp_0 = (register_0 >> 43) & ((1ULL << 21) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 336); + tmp_0 |= ((register_0) & ((1ULL << 42) - 1)) << 21; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = tmp_0; + tmp_0 = (register_0 >> 42) & ((1ULL << 22) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 352); + tmp_0 |= ((register_0) & ((1ULL << 41) - 1)) << 22; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = tmp_0; + tmp_0 = (register_0 >> 41) & ((1ULL << 23) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 368); + tmp_0 |= ((register_0) & ((1ULL << 40) - 1)) << 23; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = tmp_0; + tmp_0 = (register_0 >> 40) & ((1ULL << 24) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 384); + tmp_0 |= ((register_0) & ((1ULL << 39) - 1)) << 24; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = tmp_0; + tmp_0 = (register_0 >> 39) & ((1ULL << 25) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 400); + tmp_0 |= ((register_0) & ((1ULL << 38) - 1)) << 25; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = tmp_0; + tmp_0 = (register_0 >> 38) & ((1ULL << 26) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 416); + tmp_0 |= ((register_0) & ((1ULL << 37) - 1)) << 26; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = tmp_0; + tmp_0 = (register_0 >> 37) & ((1ULL << 27) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 432); + tmp_0 |= ((register_0) & ((1ULL << 36) - 1)) << 27; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = tmp_0; + tmp_0 = (register_0 >> 36) & ((1ULL << 28) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 448); + tmp_0 |= ((register_0) & ((1ULL << 35) - 1)) << 28; + tmp_0 
+= base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = tmp_0; + tmp_0 = (register_0 >> 35) & ((1ULL << 29) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 464); + tmp_0 |= ((register_0) & ((1ULL << 34) - 1)) << 29; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = tmp_0; + tmp_0 = (register_0 >> 34) & ((1ULL << 30) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 480); + tmp_0 |= ((register_0) & ((1ULL << 33) - 1)) << 30; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = tmp_0; + tmp_0 = (register_0 >> 33) & ((1ULL << 31) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 496); + tmp_0 |= ((register_0) & ((1ULL << 32) - 1)) << 31; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = tmp_0; + tmp_0 = (register_0 >> 32) & ((1ULL << 32) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 512); + tmp_0 |= ((register_0) & ((1ULL << 31) - 1)) << 32; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = tmp_0; + tmp_0 = (register_0 >> 31) & ((1ULL << 33) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 528); + tmp_0 |= ((register_0) & ((1ULL << 30) - 1)) << 33; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = tmp_0; + tmp_0 = (register_0 >> 30) & ((1ULL << 34) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 544); + tmp_0 |= ((register_0) & ((1ULL << 29) - 1)) << 34; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = tmp_0; + tmp_0 = (register_0 >> 29) & ((1ULL << 35) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 560); + tmp_0 |= ((register_0) & ((1ULL << 28) - 1)) << 35; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = tmp_0; + tmp_0 = (register_0 >> 28) & ((1ULL << 36) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 576); + tmp_0 |= ((register_0) & ((1ULL << 27) - 1)) << 36; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = tmp_0; + tmp_0 = (register_0 >> 27) & ((1ULL << 37) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 592); + tmp_0 |= ((register_0) & ((1ULL << 26) - 1)) << 37; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = tmp_0; + tmp_0 = (register_0 >> 26) & ((1ULL << 38) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 608); + tmp_0 |= ((register_0) & ((1ULL << 25) - 1)) << 38; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = tmp_0; + tmp_0 = (register_0 >> 25) & ((1ULL << 39) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 624); + tmp_0 |= ((register_0) & ((1ULL << 24) - 1)) << 39; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = tmp_0; + tmp_0 = (register_0 >> 24) & ((1ULL << 40) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 640); + tmp_0 |= ((register_0) & ((1ULL << 23) - 1)) << 40; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = tmp_0; + tmp_0 = (register_0 >> 23) & ((1ULL << 41) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 656); + tmp_0 |= ((register_0) & ((1ULL << 22) - 1)) << 41; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = tmp_0; + tmp_0 = (register_0 >> 22) & ((1ULL << 42) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 672); + tmp_0 |= ((register_0) & ((1ULL << 21) - 1)) << 42; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = tmp_0; + tmp_0 = (register_0 >> 21) & ((1ULL << 43) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 688); + tmp_0 |= ((register_0) & ((1ULL << 20) - 1)) << 43; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = tmp_0; + tmp_0 = (register_0 >> 20) & ((1ULL << 44) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 704); + tmp_0 |= ((register_0) & 
((1ULL << 19) - 1)) << 44; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = tmp_0; + tmp_0 = (register_0 >> 19) & ((1ULL << 45) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 720); + tmp_0 |= ((register_0) & ((1ULL << 18) - 1)) << 45; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = tmp_0; + tmp_0 = (register_0 >> 18) & ((1ULL << 46) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 736); + tmp_0 |= ((register_0) & ((1ULL << 17) - 1)) << 46; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 46)) = tmp_0; + tmp_0 = (register_0 >> 17) & ((1ULL << 47) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 752); + tmp_0 |= ((register_0) & ((1ULL << 16) - 1)) << 47; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = tmp_0; + tmp_0 = (register_0 >> 16) & ((1ULL << 48) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 768); + tmp_0 |= ((register_0) & ((1ULL << 15) - 1)) << 48; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = tmp_0; + tmp_0 = (register_0 >> 15) & ((1ULL << 49) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 784); + tmp_0 |= ((register_0) & ((1ULL << 14) - 1)) << 49; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = tmp_0; + tmp_0 = (register_0 >> 14) & ((1ULL << 50) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 800); + tmp_0 |= ((register_0) & ((1ULL << 13) - 1)) << 50; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = tmp_0; + tmp_0 = (register_0 >> 13) & ((1ULL << 51) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 816); + tmp_0 |= ((register_0) & ((1ULL << 12) - 1)) << 51; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = tmp_0; + tmp_0 = (register_0 >> 12) & ((1ULL << 52) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 832); + tmp_0 |= ((register_0) & ((1ULL << 11) - 1)) << 52; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = tmp_0; + tmp_0 = (register_0 >> 11) & ((1ULL << 53) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 848); + tmp_0 |= ((register_0) & ((1ULL << 10) - 1)) << 53; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = tmp_0; + tmp_0 = (register_0 >> 10) & ((1ULL << 54) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 864); + tmp_0 |= ((register_0) & ((1ULL << 9) - 1)) << 54; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = tmp_0; + tmp_0 = (register_0 >> 9) & ((1ULL << 55) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 880); + tmp_0 |= ((register_0) & ((1ULL << 8) - 1)) << 55; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = tmp_0; + tmp_0 = (register_0 >> 8) & ((1ULL << 56) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 896); + tmp_0 |= ((register_0) & ((1ULL << 7) - 1)) << 56; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = tmp_0; + tmp_0 = (register_0 >> 7) & ((1ULL << 57) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 912); + tmp_0 |= ((register_0) & ((1ULL << 6) - 1)) << 57; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = tmp_0; + tmp_0 = (register_0 >> 6) & ((1ULL << 58) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 928); + tmp_0 |= ((register_0) & ((1ULL << 5) - 1)) << 58; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = tmp_0; + tmp_0 = (register_0 >> 5) & ((1ULL << 59) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 944); + tmp_0 |= ((register_0) & ((1ULL << 4) - 1)) << 59; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = tmp_0; + tmp_0 = (register_0 >> 4) & ((1ULL << 60) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 960); + tmp_0 
|= ((register_0) & ((1ULL << 3) - 1)) << 60; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = tmp_0; + tmp_0 = (register_0 >> 3) & ((1ULL << 61) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 976); + tmp_0 |= ((register_0) & ((1ULL << 2) - 1)) << 61; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = tmp_0; + tmp_0 = (register_0 >> 2) & ((1ULL << 62) - 1); + register_0 = *(in + (0 * 16) + (i * 1) + 992); + tmp_0 |= ((register_0) & ((1ULL << 1) - 1)) << 62; + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = tmp_0; + tmp_0 = (register_0 >> 1) & ((1ULL << 63) - 1); + tmp_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = tmp_0; + } +} +static void unffor_64bw_64ow_64crw_1uf(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + const uint64_t* __restrict a_base_p) { + [[maybe_unused]] auto out = reinterpret_cast<uint64_t*>(a_out_p); + [[maybe_unused]] const auto in = reinterpret_cast<const uint64_t*>(a_in_p); + [[maybe_unused]] uint64_t register_0; + [[maybe_unused]] uint64_t tmp_0; + [[maybe_unused]] uint64_t base_0 = *(a_base_p); + for (int i = 0; i < 16; ++i) { + register_0 = *(in + (0 * 16) + (i * 1) + 0); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 0)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 16); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 1)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 32); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 2)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 48); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 3)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 64); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 4)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 80); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 5)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 96); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 6)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 112); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 7)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 128); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 8)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 144); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 9)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 160); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 10)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 176); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 11)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 192); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 12)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 208); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 13)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 224); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 14)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 240); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 15)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 256); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 16)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 272); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 17)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 288); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 18)) = register_0; +
register_0 = *(in + (0 * 16) + (i * 1) + 304); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 19)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 320); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 20)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 336); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 21)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 352); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 22)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 368); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 23)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 384); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 24)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 400); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 25)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 416); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 26)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 432); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 27)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 448); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 28)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 464); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 29)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 480); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 30)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 496); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 31)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 512); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 32)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 528); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 33)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 544); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 34)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 560); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 35)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 576); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 36)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 592); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 37)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 608); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 38)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 624); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 39)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 640); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 40)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 656); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 41)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 672); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 42)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 688); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 43)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 704); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 44)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 720); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 45)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 736); + register_0 += base_0; 
+ *(out + (i * 1) + (0 * 16) + (16 * 46)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 752); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 47)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 768); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 48)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 784); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 49)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 800); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 50)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 816); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 51)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 832); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 52)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 848); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 53)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 864); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 54)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 880); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 55)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 896); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 56)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 912); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 57)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 928); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 58)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 944); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 59)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 960); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 60)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 976); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 61)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 992); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 62)) = register_0; + register_0 = *(in + (0 * 16) + (i * 1) + 1008); + register_0 += base_0; + *(out + (i * 1) + (0 * 16) + (16 * 63)) = register_0; + } +} +void unffor(const uint8_t* __restrict a_in_p, + uint8_t* __restrict a_out_p, + uint8_t bw, + const uint8_t* __restrict a_base_p) { + switch (bw) { + case 0: + unffor_0bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 1: + unffor_1bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 2: + unffor_2bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 3: + unffor_3bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 4: + unffor_4bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 5: + unffor_5bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 6: + unffor_6bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 7: + unffor_7bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 8: + unffor_8bw_8ow_8crw_1uf(a_in_p, a_out_p, a_base_p); + break; + } +} +void unffor(const uint16_t* __restrict a_in_p, + uint16_t* __restrict a_out_p, + uint8_t bw, + const uint16_t* __restrict a_base_p) { + switch (bw) { + case 0: + unffor_0bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 1: + unffor_1bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 2: + unffor_2bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 3: + unffor_3bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 4: + 
unffor_4bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 5: + unffor_5bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 6: + unffor_6bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 7: + unffor_7bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 8: + unffor_8bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 9: + unffor_9bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 10: + unffor_10bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 11: + unffor_11bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 12: + unffor_12bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 13: + unffor_13bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 14: + unffor_14bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 15: + unffor_15bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 16: + unffor_16bw_16ow_16crw_1uf(a_in_p, a_out_p, a_base_p); + break; + } +} +void unffor(const uint32_t* __restrict a_in_p, + uint32_t* __restrict a_out_p, + uint8_t bw, + const uint32_t* __restrict a_base_p) { + switch (bw) { + case 0: + unffor_0bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 1: + unffor_1bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 2: + unffor_2bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 3: + unffor_3bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 4: + unffor_4bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 5: + unffor_5bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 6: + unffor_6bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 7: + unffor_7bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 8: + unffor_8bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 9: + unffor_9bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 10: + unffor_10bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 11: + unffor_11bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 12: + unffor_12bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 13: + unffor_13bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 14: + unffor_14bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 15: + unffor_15bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 16: + unffor_16bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 17: + unffor_17bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 18: + unffor_18bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 19: + unffor_19bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 20: + unffor_20bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 21: + unffor_21bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 22: + unffor_22bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 23: + unffor_23bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 24: + unffor_24bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 25: + unffor_25bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 26: + unffor_26bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 27: + unffor_27bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 28: + unffor_28bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 29: + unffor_29bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 30: + unffor_30bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 31: + unffor_31bw_32ow_32crw_1uf(a_in_p, 
a_out_p, a_base_p); + break; + case 32: + unffor_32bw_32ow_32crw_1uf(a_in_p, a_out_p, a_base_p); + break; + } +} +void unffor(const uint64_t* __restrict a_in_p, + uint64_t* __restrict a_out_p, + uint8_t bw, + const uint64_t* __restrict a_base_p) { + switch (bw) { + case 0: + unffor_0bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 1: + unffor_1bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 2: + unffor_2bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 3: + unffor_3bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 4: + unffor_4bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 5: + unffor_5bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 6: + unffor_6bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 7: + unffor_7bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 8: + unffor_8bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 9: + unffor_9bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 10: + unffor_10bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 11: + unffor_11bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 12: + unffor_12bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 13: + unffor_13bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 14: + unffor_14bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 15: + unffor_15bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 16: + unffor_16bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 17: + unffor_17bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 18: + unffor_18bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 19: + unffor_19bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 20: + unffor_20bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 21: + unffor_21bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 22: + unffor_22bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 23: + unffor_23bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 24: + unffor_24bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 25: + unffor_25bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 26: + unffor_26bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 27: + unffor_27bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 28: + unffor_28bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 29: + unffor_29bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 30: + unffor_30bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 31: + unffor_31bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 32: + unffor_32bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 33: + unffor_33bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 34: + unffor_34bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 35: + unffor_35bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 36: + unffor_36bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 37: + unffor_37bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 38: + unffor_38bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 39: + unffor_39bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 40: + unffor_40bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 41: + unffor_41bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; + case 42: + unffor_42bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p); + break; 
+	case 43:
+		unffor_43bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 44:
+		unffor_44bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 45:
+		unffor_45bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 46:
+		unffor_46bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 47:
+		unffor_47bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 48:
+		unffor_48bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 49:
+		unffor_49bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 50:
+		unffor_50bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 51:
+		unffor_51bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 52:
+		unffor_52bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 53:
+		unffor_53bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 54:
+		unffor_54bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 55:
+		unffor_55bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 56:
+		unffor_56bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 57:
+		unffor_57bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 58:
+		unffor_58bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 59:
+		unffor_59bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 60:
+		unffor_60bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 61:
+		unffor_61bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 62:
+		unffor_62bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 63:
+		unffor_63bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	case 64:
+		unffor_64bw_64ow_64crw_1uf(a_in_p, a_out_p, a_base_p);
+		break;
+	}
+}
+}}}} // namespace fastlanes::generated::unffor::fallback::scalar
diff --git a/src/fastlanes_unffor.cpp b/src/fastlanes_unffor.cpp
new file mode 100644
index 0000000..d8e3332
--- /dev/null
+++ b/src/fastlanes_unffor.cpp
@@ -0,0 +1,36 @@
+#include "fastlanes/unffor.hpp"
+
+namespace fastlanes::generated::unffor::fallback::scalar {
+
+void unffor(const int64_t* __restrict in, int64_t* __restrict out, uint8_t bw, const int64_t* __restrict a_base_p) {
+	auto const* in_u   = reinterpret_cast<const uint64_t*>(in);
+	auto*       out_u  = reinterpret_cast<uint64_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint64_t*>(a_base_p);
+
+	unffor(in_u, out_u, bw, base_u);
+}
+
+void unffor(const int32_t* __restrict in, int32_t* __restrict out, uint8_t bw, const int32_t* __restrict a_base_p) {
+	auto const* in_u   = reinterpret_cast<const uint32_t*>(in);
+	auto*       out_u  = reinterpret_cast<uint32_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint32_t*>(a_base_p);
+
+	unffor(in_u, out_u, bw, base_u);
+}
+
+void unffor(const int16_t* __restrict in, int16_t* __restrict out, uint8_t bw, const int16_t* __restrict a_base_p) {
+	auto const* in_u   = reinterpret_cast<const uint16_t*>(in);
+	auto*       out_u  = reinterpret_cast<uint16_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint16_t*>(a_base_p);
+
+	unffor(in_u, out_u, bw, base_u);
+}
+
+void unffor(const int8_t* __restrict in, int8_t* __restrict out, uint8_t bw, const int8_t* __restrict a_base_p) {
+	auto const* in_u   = reinterpret_cast<const uint8_t*>(in);
+	auto*       out_u  = reinterpret_cast<uint8_t*>(out);
+	auto const* base_u = reinterpret_cast<const uint8_t*>(a_base_p);
+
+	unffor(in_u, out_u, bw, base_u);
+}
+} // namespace fastlanes::generated::unffor::fallback::scalar
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..4d576c1
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_executable(test_alp_sample test_alp_sample.cpp)
+target_link_libraries(test_alp_sample PUBLIC gtest_main ALP)
+gtest_discover_tests(test_alp_sample)
diff --git a/test/include/test/mapper.hpp b/test/include/test/mapper.hpp
new file mode 100644
index 0000000..901bb3e
--- /dev/null
+++ b/test/include/test/mapper.hpp
@@ -0,0 +1,27 @@
+#ifndef TEST_MAPPER_HPP
+#define TEST_MAPPER_HPP
+
+#include <cstddef>
+#include <fcntl.h>
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace mapper {
+
+template <typename T>
+inline T* mmap_file(size_t& n_values, const std::string& filename) {
+	struct stat file_stats;
+
+	const int fd = ::open(filename.c_str(), O_RDONLY);
+	fstat(fd, &file_stats);
+	const size_t file_size = file_stats.st_size;
+	auto* file_pointer = static_cast<T*>(mmap(0, file_size, PROT_READ, MAP_PRIVATE, fd, 0));
+	n_values = file_size / sizeof(T);
+	return file_pointer;
+}
+
+} // namespace mapper
+
+#endif
\ No newline at end of file
diff --git a/test/test_alp_sample.cpp b/test/test_alp_sample.cpp
new file mode 100644
index 0000000..959eb5e
--- /dev/null
+++ b/test/test_alp_sample.cpp
@@ -0,0 +1,188 @@
+#include "alp.hpp"
+#include "data.hpp"
+#include "test/mapper.hpp"
+#include "gtest/gtest.h"
+
+#include <cerrno>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+using namespace alp::config;
+
+/// ALP encoded size per vector = bit_width + factor-idx + exponent-idx + ffor base;
+double overhead_per_vector {static_cast<double>(8 + 8 + 8 + 64) / VECTOR_SIZE};
+
+/// ALP_RD encoded overhead size per vector
+double alprd_overhead_per_vector {static_cast<double>(MAX_RD_DICTIONARY_SIZE * 16) / ROWGROUP_SIZE};
+
+namespace test {
+template <typename T>
+void ALP_ASSERT(T original_val, T decoded_val) {
+	if (original_val == 0.0 && std::signbit(original_val)) {
+		ASSERT_EQ(decoded_val, 0.0);
+		ASSERT_TRUE(std::signbit(decoded_val));
+	} else if (std::isnan(original_val)) {
+		ASSERT_TRUE(std::isnan(decoded_val));
+	} else {
+		ASSERT_EQ(original_val, decoded_val);
+	}
+}
+} // namespace test
+class alp_test : public ::testing::Test {
+public:
+	double* dbl_arr {};
+	double* exc_arr {};
+	uint16_t* rd_exc_arr {};
+	uint16_t* pos_arr {};
+	uint16_t* exc_c_arr {};
+	int64_t* ffor_arr {};
+	int64_t* unffor_arr {};
+	int64_t* base_arr {};
+	int64_t* encoded_arr {};
+	double* dec_dbl_arr {};
+	double* smp_arr {};
+	uint64_t* ffor_right_arr {};
+	uint16_t* ffor_left_arr {};
+	uint64_t* right_arr {};
+	uint16_t* left_arr {};
+	uint64_t* unffor_right_arr {};
+	uint16_t* unffor_left_arr {};
+	double* glue_arr {};
+
+	alp::state state;
+
+	alp::bw_t bit_width {};
+
+	void SetUp() override {
+		dbl_arr          = new double[VECTOR_SIZE];
+		exc_arr          = new double[VECTOR_SIZE];
+		rd_exc_arr       = new uint16_t[VECTOR_SIZE];
+		pos_arr          = new uint16_t[VECTOR_SIZE];
+		encoded_arr      = new int64_t[VECTOR_SIZE];
+		dec_dbl_arr      = new double[VECTOR_SIZE];
+		exc_c_arr        = new uint16_t[VECTOR_SIZE];
+		ffor_arr         = new int64_t[VECTOR_SIZE];
+		unffor_arr       = new int64_t[VECTOR_SIZE];
+		base_arr         = new int64_t[VECTOR_SIZE];
+		smp_arr          = new double[VECTOR_SIZE];
+		right_arr        = new uint64_t[VECTOR_SIZE];
+		left_arr         = new uint16_t[VECTOR_SIZE];
+		ffor_right_arr   = new uint64_t[VECTOR_SIZE];
+		ffor_left_arr    = new uint16_t[VECTOR_SIZE];
+		unffor_right_arr = new uint64_t[VECTOR_SIZE];
+		unffor_left_arr  = new uint16_t[VECTOR_SIZE];
+		glue_arr         = new double[VECTOR_SIZE];
+	}
+
+	~alp_test() override {
+		delete[] dbl_arr;
+		delete[] exc_arr;
+		delete[] rd_exc_arr;
+		delete[] pos_arr;
+		delete[] encoded_arr;
+		delete[] dec_dbl_arr;
+		delete[] exc_c_arr;
+		delete[] ffor_arr;
+		delete[] unffor_arr;
+		delete[] base_arr;
+		delete[] smp_arr;
+		delete[] right_arr;
+		delete[] left_arr;
+		delete[] ffor_right_arr;
+		delete[] ffor_left_arr;
+		delete[] unffor_right_arr;
+		delete[] unffor_left_arr;
+		delete[] glue_arr;
+	}
+
+	void test_column(const alp_bench::Column& column) {
+		std::ifstream file(column.sample_csv_file_path, std::ios::in);
+		if (!file) throw std::runtime_error(column.sample_csv_file_path + " : " + strerror(errno));
+
+		alp::state stt;
+		size_t     tuples_count {VECTOR_SIZE};
+		size_t     rowgroup_offset {0};
+
+		double      value_to_encode;
+		std::string val_str;
+		// Keep reading values from the sample file for as long as data exists:
+		size_t vector_idx {0};
+		while (file >> val_str) {
+			value_to_encode     = std::stod(val_str);
+			dbl_arr[vector_idx] = value_to_encode;
+
+			vector_idx += 1;
+		}
+
+		// Init
+		alp::AlpEncode::init(dbl_arr, rowgroup_offset, tuples_count, smp_arr, stt);
+
+		switch (stt.scheme) {
+		case alp::SCHEME::ALP_RD: {
+			alp::AlpRD::init(dbl_arr, rowgroup_offset, tuples_count, smp_arr, stt);
+
+			// Encode
+			alp::AlpRD::encode(dbl_arr, rd_exc_arr, pos_arr, exc_c_arr, right_arr, left_arr, stt);
+			ffor::ffor(right_arr, ffor_right_arr, stt.right_bit_width, &stt.right_for_base);
+			ffor::ffor(left_arr, ffor_left_arr, stt.left_bit_width, &stt.left_for_base);
+
+			// Decode
+			unffor::unffor(ffor_right_arr, unffor_right_arr, stt.right_bit_width, &stt.right_for_base);
+			unffor::unffor(ffor_left_arr, unffor_left_arr, stt.left_bit_width, &stt.left_for_base);
+			alp::AlpRD::decode(glue_arr, unffor_right_arr, unffor_left_arr, rd_exc_arr, pos_arr, exc_c_arr, stt);
+
+			size_t mismatch_count {0};
+			for (size_t i = 0; i < VECTOR_SIZE; ++i) {
+				auto l = dbl_arr[i];
+				auto r = glue_arr[i];
+				if (l != r) { std::cout << mismatch_count++ << " | " << i << " r : " << r << " l : " << l << std::endl; }
+				test::ALP_ASSERT(l, r);
+			}
+
+			break;
+		}
+		case alp::SCHEME::ALP: {
+			// Encode
+			alp::AlpEncode::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, encoded_arr, stt);
+			alp::AlpEncode::analyze_ffor(encoded_arr, bit_width, base_arr);
+			ffor::ffor(encoded_arr, ffor_arr, bit_width, base_arr);
+
+			// Decode
+			generated::falp::fallback::scalar::falp(reinterpret_cast<uint64_t*>(ffor_arr),
+			                                        dec_dbl_arr,
+			                                        bit_width,
+			                                        reinterpret_cast<uint64_t*>(base_arr),
+			                                        stt.fac,
+			                                        stt.exp);
+			alp::AlpDecode::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
+
+			auto exceptions_count = exc_c_arr[0];
+
+			for (size_t i = 0; i < VECTOR_SIZE; ++i) {
+				test::ALP_ASSERT(dbl_arr[i], dec_dbl_arr[i]);
+			}
+
+			ASSERT_EQ(column.exceptions_count, exceptions_count);
+			ASSERT_EQ(column.bit_width, bit_width);
+		}
+		}
+
+		std::cout << "Testing ALP on one vector on dataset: " << column.name << " OK" << std::endl;
+
+		file.close();
+	}
+};
+
+/// Test for correctness of bit width and exceptions on the first vector of each dataset
+TEST_F(alp_test, test_alp) {
+	for (const auto& col : alp_bench::alp_dataset) {
+		test_column(col);
+	}
+}
+
+/// Test for correctness of bit width and exceptions on the first vector of generated data
+TEST_F(alp_test, test_alp_on_generated) {
+	for (const auto& col : alp_bench::generated_cols) {
+		if (col.bit_width > 42) { continue; }
+		test_column(col);
+	}
+}
+
+/// Test for correctness of bit width and exceptions on the first vector of edge_case data
+TEST_F(alp_test, test_alp_on_edge_case) {
+	for (const auto& col : alp_bench::edge_case) {
+		test_column(col);
+	}
+}
diff --git a/toolchain/example.cmake b/toolchain/example.cmake
new file mode 100644
index 0000000..65df82a
--- /dev/null
+++ b/toolchain/example.cmake
@@ -0,0 +1,5 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR x86_64)
+
+set(CMAKE_C_COMPILER /usr/bin/clang)
+set(CMAKE_CXX_COMPILER /usr/bin/clang++)
\ No newline at end of file
diff --git a/toolchain/m1.cmake b/toolchain/m1.cmake
new file mode 100644
index 0000000..1498c7d
--- /dev/null
+++ b/toolchain/m1.cmake
@@ -0,0 +1,6 @@
+set(CMAKE_SYSTEM_NAME Darwin)
+set(CMAKE_SYSTEM_PROCESSOR arm64)
+
+set(CMAKE_C_COMPILER "/Library/Developer/CommandLineTools/usr/bin/cc")
+set(CMAKE_CXX_COMPILER "/Library/Developer/CommandLineTools/usr/bin/c++")
+set(CMAKE_LINKER "/Library/Developer/CommandLineTools/usr/bin/ld")
\ No newline at end of file
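
Reviewer note, not part of the patch: the sketch below illustrates how the unffor entry points added above are meant to be driven through the signed-integer wrappers in src/fastlanes_unffor.cpp. It is a minimal round-trip example under stated assumptions: the packing counterpart is assumed to live in "fastlanes/ffor.hpp" with a namespace and signature mirroring unffor's (the ffor::ffor / unffor::unffor calls in test_alp_sample.cpp suggest such aliases), and kValues and the main() harness are illustrative only.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    #include "fastlanes/ffor.hpp"   // assumed counterpart of fastlanes/unffor.hpp
    #include "fastlanes/unffor.hpp"

    // Aliases assumed to mirror the ones used by the test suite.
    namespace ffor   = fastlanes::generated::ffor::fallback::scalar;
    namespace unffor = fastlanes::generated::unffor::fallback::scalar;

    int main() {
    	constexpr size_t kValues = 1024; // the kernels above process 1024 values (16 lanes x 64 rows)
    	std::vector<int64_t> input(kValues);
    	std::vector<int64_t> packed(kValues); // oversized; only bw * 16 words are actually written
    	std::vector<int64_t> output(kValues);

    	// Frame-of-reference data: values in [base, base + 8) need only a 3-bit width.
    	const int64_t base = 1000;
    	for (size_t i = 0; i < kValues; ++i) { input[i] = base + static_cast<int64_t>(i % 8); }

    	const uint8_t bw = 3;
    	ffor::ffor(input.data(), packed.data(), bw, &base);      // subtract base, bit-pack to bw bits
    	unffor::unffor(packed.data(), output.data(), bw, &base); // dispatch selects unffor_3bw_64ow_64crw_1uf

    	for (size_t i = 0; i < kValues; ++i) { assert(input[i] == output[i]); } // lossless round-trip
    	return 0;
    }

With bw = 3, the 1024 input words pack into 48 output words (16 words per bit of width), which is where the FOR + bit-packing compression comes from; the bw switch in unffor() then routes decompression to the matching unrolled kernel.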